Description

This R markdown file contains all of the analyses for the KOSMOS paper. For any questions about the analysis, please contact me at .

Load libraries

Sample map (Figure 1)

Load base shapefiles

# Get world map
world_df <- map_data("world")

# pull out only peru
peru_df <- world_df[world_df$region == "Peru",]

# Load data
# peru_spdf <- readOGR(dsn = "/Users/mmin/Documents/KOSMOS_data_local/map_files/PER_adm/PER_adm0.shp")
peru_spdf <- readOGR(dsn = here::here("data", "map_files", "PER_adm", "PER_adm0.shp"))
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/markusmin/Documents/MBARI-2167/KOSMOS_eDNA_paper/KOSMOS_eDNA_paper/data/map_files/PER_adm/PER_adm0.shp", layer: "PER_adm0"
## with 1 features
## It has 70 fields
## Integer64 fields read as strings:  ID_0 OBJECTID_1
# Convert to df(ish)
peru_spdf_fort <- tidy(peru_spdf)
## Regions defined for each Polygons

Plot map in ggplot

peru_map <- ggplot(data = world_df, aes(x = long, y = lat, group = group))+
  geom_polygon(fill = "gray60", color = "grey40")+
  geom_polygon(data = peru_df,aes(x = long, y = lat, group = group, fill = region), color = "black")+
  scale_fill_manual(values = c("white"))+
  coord_fixed(xlim = c(-90, -68),  ylim = c(-29,1), ratio = 1.3)+
  theme(legend.position = "none",
        panel.background = element_rect(fill="#c6dbef"),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_rect(colour = "black", fill=NA, size=2),
        axis.text = element_text(size = 13,face = "bold"),
        axis.title = element_text(size = 15, face = "bold"))+
  ylab("Latitude")+
  xlab("Longitude")+
  scale_x_continuous(breaks = c(-90,-85,-80,-75,-70), expand = c(0,0), labels=c(expression(paste(90*degree,"W")),expression(paste(85*degree,"W")),
    expression(paste(80*degree,"W")),expression(paste(75*degree,"W")),
    expression(paste(70*degree,"W"))))+
  scale_y_continuous(breaks = seq(-25,0,5), expand = c(0,0), labels=c(expression(paste(25*degree,"S")),
    expression(paste(20*degree,"S")),expression(paste(15*degree,"S")),
    expression(paste(10*degree,"S")),expression(paste(5*degree,"S")),expression(paste(0*degree))))+
  annotate("rect", xmin = -77.6, xmax = -76.8, ymin = -12.5, ymax = -11.9,color="firebrick3", fill = NA)+
  annotate("text",label = "Peru",x = -75.5, y = -10,size = 8)+
  annotate("text", label = " La Punta \n (Callao) ",x = -86, y = -12.2, size = 8)+
  geom_segment(aes(x = -83, y = -12.2, xend = -78, yend = -12.2), colour='black', size=1,arrow = arrow(length = unit(0.25, "cm"),type = "closed"))+
  # callout to mesocosm setup
  annotate("segment",x = -77.6, y = -12.5, xend = -87, yend = -15.5, color = "firebrick3", lty = 2)+
  annotate("segment",x = -76.8, y = -12.5, xend = -78, yend = -15.5, color = "firebrick3", lty = 2)+
  # mesocosm arrangement - manually because I stink at for loops
  # Box
  annotate("rect", xmin = -87, xmax = -78, ymin = -15.5, ymax = -27.5,color="firebrick3",fill=NA, lty = 3)+ #background
  # Left row
  annotate("point", x = -85, y = -17, shape = 21, fill = "white", size = 12, color = "firebrick3",stroke = 2)+ # M1
  annotate("text", x = -85, y = -17,label = "M1", color = "black",size = 5)+ # M1
  annotate("point", x = -85, y = -20, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M3
  annotate("text", x = -85, y = -20,label = "M3", color = "black", alpha = 0.5)+ # M3
  annotate("point", x = -85, y = -23, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M5
  annotate("text", x = -85, y = -23,label = "M5", color = "black", alpha = 0.5)+ # M5
  annotate("point", x = -85, y = -26, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M7
  annotate("text", x = -85, y = -26, label = "M7", color = "black", alpha = 0.5)+ # M7
  # Right row
  annotate("point", x = -80, y = -17, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M2
  annotate("text", x = -80, y = -17,label = "M2", color = "black", alpha = 0.5)+ # M2
  annotate("point", x = -80, y = -20, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M4
  annotate("text", x = -80, y = -20,label = "M4", color = "black", alpha = 0.5)+ # M4
  annotate("point", x = -80, y = -23, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M6
  annotate("text", x = -80, y = -23,label = "M6", color = "black", alpha = 0.5)+ # M6
  annotate("point", x = -80, y = -26, shape = 21, fill = "white", size = 12, stroke = 1, alpha = 0.5)+ # M8
  annotate("text", x = -80, y = -26, label = "M8", color = "black", alpha = 0.5)+ # M8
  # Pacific
  # annotate("point", x = -77.3175, y = -12.0875, shape = 21, fill = "white",color = "white", size = 15, stroke = 1)+ # Pacific
  annotate("text", x = -82.5, y = -21.5,label = "bold(Pacific)", color = "darkblue",size = 6, parse = TRUE) # Pacific

peru_map

ggsave("kosmos_map.pdf", width = 5, height = 8,path = here::here("figures"))

Biogeochemical data (Figure 2, first column)

Import data

M_nutrients <- read.csv(file = here::here("data", "kosmos_physical_data", "Figure_4_Francisco.csv"))
M_chla_long <- read.csv(file = here::here("data", "kosmos_physical_data", "chla_Francisco.csv"))
kosmos_T <- read.csv(file = here::here("data", "kosmos_physical_data", "kosmos_temperature_all.csv"))
kosmos_S <- read.csv(file = here::here("data", "kosmos_physical_data", "kosmos_salinity_all.csv"))


# Convert all five datasets into long form for plots
# Separate out long form data into each nutrient separately
NOx_long <- subset(M_nutrients, parameter %in% c("NOx"))
NOx_long %>% mutate(mesocosm = paste0("M",mesocosm)) %>% mutate(mesocosm = ifelse(mesocosm == "M10","Pacific",mesocosm)) -> NOx_long
SiO4_long <-subset(M_nutrients, parameter %in% c("SiO4"))
SiO4_long %>% mutate(mesocosm = paste0("M",mesocosm)) %>% mutate(mesocosm = ifelse(mesocosm == "M10","Pacific",mesocosm)) -> SiO4_long
rownames(M_chla_long) = NULL
chla_long <- M_chla_long
chla_long %>% mutate(mesocosm = paste0("M",mesocosm)) %>% mutate(mesocosm = ifelse(mesocosm == "M10","Pacific",mesocosm)) -> chla_long
T_long <- kosmos_T %>% gather(.,mesocosm, value, M1:Pacific)
parameter = rep("temp",nrow(T_long))
T_long %>% dplyr::rename(.,sampling.day = day) %>% data.frame(.,parameter) -> T_long
S_long <- kosmos_S %>% gather(.,mesocosm, value, M1:Pacific)
parameter = rep("sal",nrow(S_long))
S_long %>% dplyr::rename(.,sampling.day = day) %>% data.frame(.,parameter) -> S_long

# Concat the different variables, export as one dataframe for inclusion with publication
combined_physicochemical_data <- bind_rows(bind_rows(bind_rows(bind_rows(T_long,S_long),NOx_long),SiO4_long),chla_long)
write.csv(combined_physicochemical_data, here::here("data", "kosmos_physical_data", "Figure_2_data.csv"))

Prep dataframes using the combined dataset

combined_physicochemical_data <- read.csv(here::here("data", "kosmos_physical_data", "Figure_2_data.csv"),row.names = 1)
combined_physicochemical_data$mesocosm <- as.character(combined_physicochemical_data$mesocosm)
combined_physicochemical_data$parameter <- as.character(combined_physicochemical_data$parameter)
T_long <- subset(combined_physicochemical_data,parameter == "temp")
S_long <- subset(combined_physicochemical_data,parameter == "sal")
NOx_long <- subset(combined_physicochemical_data,parameter == "NOx")
SiO4_long <- subset(combined_physicochemical_data,parameter == "SiO4")
chla_long <- subset(combined_physicochemical_data,parameter == "chla")

Prepare data for ggplot in a for loop

variable_names <- c("NOx","SiO4","chla","T","S")
for (i in seq(1:length(variable_names))){
  df <- pivot_wider(eval(parse(text = paste0(variable_names[i],"_long"))),names_from = mesocosm, values_from = value)
  colnames(df) <- c("sampling.day","parameter","M1","M2","M3","M4","M5","M6","M7","M8","Pacific")
  # convert to long form
  df_long <- df %>% gather(.,station, value, M1:M8)
  # drop Pacific values
  df_long <- df_long[,!(names(df_long) %in% c("Pacific"))]
  # Sumamrize data to calculate the mean + SE
  df_meanSE_data <- summarySE(df_long, measurevar="value", groupvars=c("sampling.day"),na.rm = TRUE)
  # Add in M1 and Pacific to this data
  complete_df <- left_join(df[,c("sampling.day", "M1","Pacific")],df_meanSE_data)
  # Rename some columns
  complete_df %>% dplyr::rename(., M1_M8_mean = value, M1_M8_sd = sd,M1_M8_se = se,M1_M8_ci = ci) -> complete_df
  # Save as a dataframe
  assign(paste0(variable_names[i],"_df"), complete_df)  
}
## Joining, by = "sampling.day"
## Joining, by = "sampling.day"
## Joining, by = "sampling.day"
## Joining, by = "sampling.day"
## Joining, by = "sampling.day"

Plots in a loop

There are a number of “ifelse” statements in this chunk of code which are used to adjust the plot margins in order to align the plots, as well as make the top and bottom (a and e) figures have x-axis labels.

variable_names <- c("T","S","NOx","SiO4","chla")
panel = c("(a)","(b)","(c)","(d)","(e)")
for (i in seq(1:length(variable_names))){
plot <- ggplot(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = M1))+
  geom_line(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = M1),color = "firebrick3",size = 1.25)+
  geom_line(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = M1_M8_mean),color = "grey50",size = 1)+
  geom_line(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = Pacific),color = "darkblue",size = 1.25)+
  geom_point(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = Pacific),color = "black",size = 1)+
  geom_ribbon(data = eval(parse(text = paste0(variable_names[i],"_df"))),aes(x = sampling.day,ymax = M1_M8_mean+M1_M8_se,ymin = M1_M8_mean-M1_M8_se),fill = "grey50",alpha = 0.5)+
  geom_point(data = subset(eval(parse(text = paste0(variable_names[i],"_df"))),sampling.day %in% c(1,8,15,24,32,36,42,48)),aes(x = sampling.day, y = Pacific), color = "darkblue", shape = 21, fill = "white", size = 3.5, stroke = 2.5)+
  geom_point(data = subset(eval(parse(text = paste0(variable_names[i],"_df"))),sampling.day %in% c(1,8,15,24,32,36,42,48)),aes(x = sampling.day, y = M1), color = "firebrick3", shape = 21, fill = "white", size = 3.5, stroke = 2.5)+
# Add points for sampling points
  geom_point(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = M1),color = "black",size = 1.5)+
  geom_point(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = Pacific),color = "black",size = 1.5)+
  xlab("Day of Experiment")+
  # Three different themes: for top panel, middle three panels, and bottom panel
    theme(panel.background = element_rect(fill="white"),
        panel.border = element_rect(colour = "black", fill=NA, size=2),
        axis.text.y = element_text(size = 15,face = "bold"),
        axis.text.x.bottom = element_text(size = ifelse(panel[i] %in% c("(e)"),20,0),face = "bold"),
        axis.text.x.top = element_text(size = ifelse(panel[i] %in% c("(a)"),20,0),face = "bold"),
        axis.title.x.bottom = element_text(size = ifelse(panel[i] %in% c("(e)"),25,0), face = "bold"),
        axis.title.x.top = element_text(size = ifelse(panel[i] %in% c("(a)"),25,0), face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold",margin = margin(0, ifelse(panel[i] %in% c("(b)"), 1.7, 
                                                                                       ifelse(panel[i] %in% c("(a)"), 11.75, 11.5)), 0,0)),
        axis.ticks.length.x.bottom = unit(0.25,"cm"),
        axis.ticks.length.x.top = unit(ifelse(panel[i] %in% c("(a)"),0.25,0),"cm"),
        plot.margin = unit(c(ifelse(panel[i] == "(a)",0,5),0,0,3), "pt"),
        legend.position = "none")+
  ylim(ifelse(panel[i] %in% c("(c)","(d)","(e)"),0,min(eval(parse(text = paste0(variable_names[i],"_df")))$Pacific)),NA)+
  # Add in secondary x axis and labels for temperature plot
    scale_x_continuous(sec.axis = sec_axis(~ .,name = "Sample Number", breaks = c(1,8,15,24,32,36,42,48),labels = c(1,2,3,4,5,6,7,8)),limits = c(1,50))+
    # geom_text(data = subset(eval(parse(text = paste0(variable_names[i],"_df"))), sampling.day %in% c(1,8,15,24,32,36,42,48)), aes(x = sampling.day, y = 21.28, label = sample_number), size = ifelse(panel[i] %in% c("(a)"),7,0), color = "black")+
    annotate("text",  x=-Inf, y = Inf, label = paste0(panel[i]), vjust=1.5, hjust=-0.3,size = 12)+
  # add vertical lines for omz water addition
    geom_vline(xintercept = 11, lty = 2, color = "seagreen", size = 1)+
    geom_vline(xintercept = 12, lty = 2, color = "seagreen", size = 1)+
  # add vertical lines for NaCl brine addition
    geom_vline(xintercept = 13, lty = 2, color = "#ff7f00", size = 1)+
    geom_vline(xintercept = 33, lty = 2, color = "#ff7f00", size = 1)+
  # Use an ifelse statement to change the y labels
  ylab(ifelse(panel[i] == "(a)",expression(bold(~"Temperature" ~ ("°C"))),
              ifelse(panel[i] == "(b)",expression(bold(~"Salinity" ~ ("psu"))),
                     ifelse(panel[i] == "(c)",expression(bold(~"NO"[2] ~ + ~"NO"[3] ~ (mu*"mol L" ^-1)~"\n")),
                            ifelse(panel[i] == "(d)",expression(bold(~"Si(OH)"[4] ~ (mu*"mol L" ^-1)~"\n")),
                                   ifelse(panel[i] == "(e)",expression(bold(~"chl-a" ~ (mu*"g L" ^-1) ~ "\n")),
              "oops you messed up"))))))

ggsave(here::here("figures", "biogeochemical_figure", paste0(variable_names[i],"_gg.png")) ,plot,width = 7, height = ifelse(panel[i] %in% c("(a)","(e)"),3.5,3))

# show plot
plot
}
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

Make the legend

legend_gg <- ggplot(data = T_long, aes(x = Day, y = value, color = variable))+
  xlim(30,38)+
  ylim(20.23,20.67)+
    theme(panel.background = element_rect(fill="white"),
        panel.border = element_rect(colour = "black", fill=NA, size=2),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(), 
        legend.position = "none",
        plot.margin = unit(c(0,0,0,58),"pt"))+
  annotate("segment", x = 30, xend = 31, y = 20.55, yend = 20.55, color = "firebrick3",size = 1.5)+ #mesocosm
  annotate("segment", x = 30, xend = 31, y = 20.35, yend = 20.35, color = "darkblue", size = 1.5)+ #pacific
  annotate("segment", x = 34.5, xend = 35.5, y = 20.55, yend = 20.55, color = "grey50", size = 1.5)+ #M1-M8 average
  annotate("segment", x = 34.5, xend = 35.5, y = 20.55, yend = 20.55, color = "grey50", size = 5,alpha = 0.5)+ #M1-M8 average SE
  annotate("point", x = 35, y = 20.35, shape = 21, colour = "black", fill = "white", size = 7, stroke = 3)+ # eDNA sample
  annotate("point", x = 35, y = 20.35, colour = "black", size = 1.5)+ # eDNA sample
  annotate("text",label = "Mesocosm 1 (M1)", x = 31.5, y = 20.55,size = 6,adj = 0)+ #mesocosm
  annotate("text",label = "Pacific", x = 31.5, y = 20.35, size = 6, adj = 0)+ #pacific
  annotate("text",label = "Mean M1-M8", x = 36, y = 20.55, size = 6, adj = 0)+ #M1-M8 average
  annotate("text",label = "eDNA Sample", x = 36, y = 20.35, size = 6, adj = 0) #eDNA sample

legend_gg

ggsave("legend_gg.png", width = 7, height = 1,path = here::here("figures"))

# Re-make, but with OMZ addition and NaCl brine additions in legend

legend_gg <- ggplot(data = T_long, aes(x = Day, y = value, color = variable))+
  xlim(30,42.5)+
  ylim(20.23,20.67)+
    theme(panel.background = element_rect(fill="white"),
        panel.border = element_rect(colour = "black", fill=NA, size=2),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(), 
        legend.position = "none",
        plot.margin = unit(c(0,0,0,58),"pt"))+
  annotate("segment", x = 30, xend = 31, y = 20.55, yend = 20.55, color = "firebrick3",size = 1.5)+ #mesocosm
  annotate("segment", x = 30, xend = 31, y = 20.35, yend = 20.35, color = "darkblue", size = 1.5)+ #pacific
  annotate("segment", x = 34.5, xend = 35.5, y = 20.55, yend = 20.55, color = "grey50", size = 1.5)+ #M1-M8 average
  annotate("segment", x = 34.5, xend = 35.5, y = 20.55, yend = 20.55, color = "grey50", size = 5,alpha = 0.5)+ #M1-M8 average SE
  annotate("point", x = 35, y = 20.35, shape = 21, colour = "black", fill = "white", size = 7, stroke = 3)+ # eDNA sample
  annotate("point", x = 35, y = 20.35, colour = "black", size = 1.5)+ # eDNA sample
  annotate("text",label = "Mesocosm 1 (M1)", x = 31.5, y = 20.55,size = 6,adj = 0)+ #mesocosm
  annotate("text",label = "Pacific", x = 31.5, y = 20.35, size = 6, adj = 0)+ #pacific
  annotate("text",label = "Mean M1-M8", x = 36, y = 20.55, size = 6, adj = 0)+ #M1-M8 average
  annotate("text",label = "eDNA Sample", x = 36, y = 20.35, size = 6, adj = 0)+ #eDNA sample
# Add OMZ and NaCl brine additions to legend
  annotate("segment", x = 38.5, xend = 39.8, y = 20.55, yend = 20.55, color = "seagreen", size = 1.5,lty = 2)+ #OMZ water addition
  annotate("text", x = 40, y = 20.55, label = "OMZ water addition", size = 6, adj = 0)+ #OMZ water addition
  annotate("segment", x = 38.5, xend = 39.8, y = 20.35, yend = 20.35, color = "#ff7f00", size = 1.5, lty = 2)+ #NaCl brine addition
  annotate("text", x = 40,  y = 20.35, label = "NaCl brine addition", size = 6, adj = 0) #NaCl brine addition
  
legend_gg

ggsave("legend_final.png", width = 11, height = 1,path = here::here("figures", "biogeochemical_figure"))

Flow data (Figure 2, second column)

Import data

# FCM_df <- read.csv("/Users/mmin/Documents/KOSMOS_data_local/Worden_lab_data/FCM/KOSMOS2017_FCM_data_final_version_MM_edits.csv")
FCM_df <- read.csv(here::here("data", "Worden_lab_data", "FCM", "KOSMOS2017_FCM_data_final_version_MM_edits.csv"))
# Convert to long format (to achieve consistency with biogeochemical data format)
# Gather the five columns that we will plot
gathercols <- c("Synechococcus", "total_Eukaryotes", "cryptophytes", "heterotrophic.bacteria", "small_bacterial_population")
FCM_df_gather <- gather(dplyr::select(FCM_df,-c(Sample,Prochlorococcus,notes)), key = "parameter", value = "value", gathercols, factor_key=TRUE)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(gathercols)` instead of `gathercols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
colnames(FCM_df_gather) <- c("mesocosm","sampling.day","parameter","value")

# Divide by 10000, so all values are in cells/ml x 10^4 (see Worden et al. 2004 for precedent)
FCM_df_gather$value <- FCM_df_gather$value/10000

# Take out only M1 and Pacific, drop DW
FCM_df_gather <- subset(FCM_df_gather, mesocosm %in% c("Pacific","Mesocosm 1"))

Prep individual dataframes from the complete dataset

FCM_df_gather$mesocosm <- as.character(FCM_df_gather$mesocosm)
FCM_df_gather$parameter <- as.character(FCM_df_gather$parameter)
syn_long <- subset(FCM_df_gather,parameter == "Synechococcus")
euks_long <- subset(FCM_df_gather,parameter == "total_Eukaryotes")
cryp_long <- subset(FCM_df_gather,parameter == "cryptophytes")
het_bact_long <- subset(FCM_df_gather,parameter == "heterotrophic.bacteria")
small_bact_long <- subset(FCM_df_gather,parameter == "small_bacterial_population")

Prepare data for ggplot in a for loop

variable_names <- c("syn","euks","cryp","het_bact","small_bact")

for (i in seq(1:length(variable_names))){
  df <- pivot_wider(eval(parse(text = paste0(variable_names[i],"_long"))),names_from = mesocosm, values_from = value)
  # Remove first sampling point, because Alex and Bente don't trust it (issue with water handling or sampling)
  df <- subset(df, sampling.day != 2)
  colnames(df) <- c("sampling.day","parameter","Pacific","M1")
  df$Pacific <- as.numeric(df$Pacific)
  df$M1 <- as.numeric(df$M1)
  assign(paste0(variable_names[i],"_df"), df)  
}                                                

euks_df
## # A tibble: 22 × 4
##    sampling.day parameter        Pacific    M1
##           <int> <chr>              <dbl> <dbl>
##  1            4 total_Eukaryotes   0.486 0.586
##  2            6 total_Eukaryotes   0.434 0.707
##  3            8 total_Eukaryotes   0.520 0.442
##  4           10 total_Eukaryotes   0.491 0.170
##  5           12 total_Eukaryotes   0.500 0.265
##  6           15 total_Eukaryotes   1.25  0.514
##  7           16 total_Eukaryotes   0.903 0.688
##  8           17 total_Eukaryotes   0.763 0.613
##  9           22 total_Eukaryotes   0.296 0.529
## 10           26 total_Eukaryotes   0.727 0.91 
## # … with 12 more rows

Plots in a loop

variable_names <- c("syn","euks","cryp","het_bact","small_bact")
panel = c("(f)","(g)","(h)","(i)","(j)")
for (i in seq(1:length(variable_names))){
  df = eval(parse(text = paste0(variable_names[i],"_df")))
  plot <- ggplot(data = df[!is.na(df$M1),], aes(x = sampling.day, y = M1))+
    geom_line(data = df[!is.na(df$M1),], aes(x = sampling.day, y = M1),color = "firebrick3",size = 1.25)+
    geom_line(data = df[!is.na(df$Pacific),], aes(x = sampling.day, y = Pacific),color = "darkblue",size = 1.25)+
    geom_point(data = subset(df,sampling.day %in% c(1,8,15,24,32,36,42,48)),aes(x = sampling.day, y = Pacific), color = "darkblue", shape = 21, fill = "white", size = 3.5, stroke = 2.5)+
    geom_point(data = subset(df,sampling.day %in% c(1,8,15,24,32,36,42,48)),aes(x = sampling.day, y = M1), color = "firebrick3", shape = 21, fill = "white", size = 3.5, stroke = 2.5)+
# Add points for sampling points
  geom_point(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = M1),color = "black",size = 1.5)+
  geom_point(data = eval(parse(text = paste0(variable_names[i],"_df"))), aes(x = sampling.day, y = Pacific),color = "black",size = 1.5)+
    xlab("Day of Experiment")+
    # Three different themes: for top panel, middle three panels, and bottom panel
    theme(panel.background = element_rect(fill="white"),
          panel.border = element_rect(colour = "black", fill=NA, size=2),
          axis.text.y = element_text(size = 15,face = "bold"),
          axis.text.x.bottom = element_text(size = ifelse(panel[i] %in% c("(j)"),20,0),face = "bold"),
          axis.text.x.top = element_text(size = ifelse(panel[i] %in% c("(f)"),20,0),face = "bold"),
          axis.title.x.bottom = element_text(size = ifelse(panel[i] %in% c("(j)"),25,0), face = "bold"),
          axis.title.x.top = element_text(size = ifelse(panel[i] %in% c("(f)"),25,0), face = "bold"),
          # axis.title.y = element_text(size = 18, face = "bold"),
          axis.ticks.length.x.bottom = unit(0.25,"cm"),
          axis.ticks.length.x.top = unit(ifelse(panel[i] %in% c("(f)"),0.25,0),"cm"),
          plot.margin = unit(c(ifelse(panel[i] == "(f)",0,5),0,0,3), "pt"),
          # plot.margin = unit(c(ifelse(panel[i] == "(f)",0,5),0,0,ifelse(panel[i] %in% c("(f)"),12.5,
          #                                                               ifelse(panel[i] %in% c("(g)"),21,
          #                                                                      ifelse(panel[i] %in% c("(h)"),0,
          #                                                                           ifelse(panel[i] %in% c("(i)"),5,
          #                                                                                 ifelse(panel[i] %in% c("(j)"),7.5,999999)))))), "pt"),
          axis.title.y = element_text(size = 18, face = "bold",margin = margin(t = ifelse(panel[i] == "(f)",0,5),r = ifelse(panel[i] %in% c("(f)"),12.5,
                                                                        ifelse(panel[i] %in% c("(g)"),21,
                                                                               ifelse(panel[i] %in% c("(h)"),0,
                                                                                    ifelse(panel[i] %in% c("(i)"),5,
                                                                                          ifelse(panel[i] %in% c("(j)"),7.5,999999))))),b =0,l =0, "pt")),
          legend.position = "none")+
    xlim(1,50)+
    # ylim(ifelse(panel[i] %in% c("(h)","(i)","(j)"),0,min(df$Pacific)),NA)+
    ylim(0,ifelse(panel[i] %in% c("(f)"),45,
                 ifelse(panel[i] %in% c("(i)"),750, NA)))+
    # Add in secondary x axis and labels for temperature plot
    scale_x_continuous(sec.axis = sec_axis(~ .,name = "Sample Number", breaks = c(1,8,15,24,32,36,42,48),labels = c(1,2,3,4,5,6,7,8)),limits = c(1,50))+
    # geom_text(data = subset(df, sampling.day %in% c(1,8,15,24,32,36,42,48)), aes(x = sampling.day, y = 21.28, label = sample_number), size = ifelse(panel[i] %in% c("(f)"),7,0), color = "black")+
    annotate("text",  x=-Inf, y = Inf, label = paste0(panel[i]), vjust=1.5, hjust=-0.3,size = 12)+
    # add vertical lines for omz water addition
    geom_vline(xintercept = 11, lty = 2, color = "seagreen", size = 1)+
    geom_vline(xintercept = 12, lty = 2, color = "seagreen", size = 1)+
    # add vertical lines for NaCl brine addition
    geom_vline(xintercept = 13, lty = 2, color = "#ff7f00", size = 1)+
    geom_vline(xintercept = 33, lty = 2, color = "#ff7f00", size = 1)+
    # Use an ifelse statement to change the y labels
    ylab(ifelse(panel[i] == "(f)",expression(bolditalic(~"Synechococcus")),
                ifelse(panel[i] == "(g)",expression(bold(~"Total eukaryotes")),
                       ifelse(panel[i] == "(h)",expression(bold(~"Cryptophytes")),
                              ifelse(panel[i] == "(i)",expression(bold(~"Heterotrophic bacteria")),
                                     ifelse(panel[i] == "(j)",expression(bold(~"Small bacteria")),
                                            "oops you messed up"))))))
  
  ggsave(here::here("figures", "biogeochemical_figure", paste0(variable_names[i],"_gg.png")),plot,width = 7, height = ifelse(panel[i] %in% c("(f)","(j)"),3.5,3))
}
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 1 rows containing missing values (geom_point).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 1 rows containing missing values (geom_point).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 1 rows containing missing values (geom_point).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).
# Make a "cells/ml" label
cells_ml_legend_gg <- ggplot(data = syn_df, aes(x = sampling.day, y = M1))+
         geom_point()+
         ylab(expression(bold(~"Cells ml" ^-1 ~ ("x 10"^4) ~ "\n")))+
  theme(axis.title.y = element_text(size = 30))

ggsave(here::here("figures", "biogeochemical_figure", "cells_ml_legend.png"), cells_ml_legend_gg,width = 7,height = 7)

Make the legend

legend_gg <- ggplot(data = T_long, aes(x = Day, y = value, color = variable))+
  xlim(30,38)+
  ylim(20.23,20.67)+
    theme(panel.background = element_rect(fill="white"),
        panel.border = element_rect(colour = "black", fill=NA, size=2),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(), 
        legend.position = "none",
        plot.margin = unit(c(0,0,0,58),"pt"))+
  annotate("segment", x = 30, xend = 31, y = 20.55, yend = 20.55, color = "firebrick3",size = 1.5)+ #mesocosm
  annotate("segment", x = 30, xend = 31, y = 20.35, yend = 20.35, color = "darkblue", size = 1.5)+ #pacific
  annotate("segment", x = 34.5, xend = 35.5, y = 20.55, yend = 20.55, color = "grey50", size = 1.5)+ #M1-M8 average
  annotate("segment", x = 34.5, xend = 35.5, y = 20.55, yend = 20.55, color = "grey50", size = 5,alpha = 0.5)+ #M1-M8 average SE
  annotate("point", x = 35, y = 20.35, shape = 21, colour = "black", fill = "white", size = 7, stroke = 3)+ # eDNA sample
  annotate("text",label = "Mesocosm 1 (M1)", x = 31.5, y = 20.55,size = 6,adj = 0)+ #mesocosm
  annotate("text",label = "Pacific", x = 31.5, y = 20.35, size = 6, adj = 0)+ #pacific
  annotate("text",label = "Mean M1-M8", x = 36, y = 20.55, size = 6, adj = 0)+ #M1-M8 average
  annotate("text",label = "eDNA Sample", x = 36, y = 20.35, size = 6, adj = 0) #eDNA sample

legend_gg

ggsave("legend_gg.png", width = 7, height = 1,path = here::here("figures", "biogeochemical_figure"))

Read in eDNA data (COI, 18S, 12S, 16S)

We will be reading in the master datasets from MBON and subsetting out only the KOSMOS project.

Please note: there are some code chunks that have “eval = FALSE” because they are used to subset the master datasets, which are too large to upload to GitHub.

COI

Read in metadata, make edits

metadata <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/COI/for_MBON/decontaminated/072720/COI_master_metadata.csv")
metadata$sample_name <- as.character(metadata$sample_name)

## Depth
# Change depth to character so that the mutate line will work
metadata$depth <- as.character(metadata$depth)

# Change surface to 1 m, bottom to 20 m for Florida samples
metadata %>% mutate(.,depth = ifelse(depth %in% c("Surface","SUR"),"1",
                                     ifelse(depth %in% c("Bottom","BOT"),"20",depth))) -> metadata

metadata$depth <- as.numeric(as.character(metadata$depth))

# Flip the depths for the 67-60 samples in the C3PO19 cruise
metadata$depth <- as.numeric(as.character(metadata$depth))
metadata %>% mutate(.,depth = ifelse(metadata$sample_name %in% c("C3PO19c28_12_eDNA_GG"),150,
                                            ifelse(metadata$sample_name %in% c("C3PO19c28_9_eDNA_GG"),100,
                                                   ifelse(metadata$sample_name %in% c("C3PO19c28_6_eDNA_GG"),60,
                                                          ifelse(metadata$sample_name %in% c("C3PO19c28_4_eDNA_GG"),20,
                                                                 ifelse(metadata$sample_name %in% c("C3PO19c28_3_eDNA_GG"),0,depth)))))) -> metadata

# Change depth from factor to numeric
metadata$depth <- as.numeric(as.character(metadata$depth))


## Add in the depth metadata for the Hawaii + Aku samples - in email from Kris Walz on 4/24/20
metadata %>% mutate(depth = ifelse(sample_name == "HI300_J", 300,
                    ifelse(sample_name == "HI650_J", 650,
                    ifelse(sample_name == "HI900_J", 900, depth)))) -> metadata
# AKU samples

## Sampling device
netfilts <- grepl('netfilt|bongo|Bongo', metadata$sample_name)
netfilt_samples <- subset(metadata, metadata$sample_name %in% metadata$sample_name[netfilts])
netfilt_sample_names <- netfilt_samples$sample_name

metadata_updated <- metadata %>%
  mutate(.,sampling_device = ifelse(sample_type %in% c("positive","negative","blank"),"CONTROL",
    ifelse(sample_name %in% netfilt_sample_names,"Net filtrate",
    ifelse(SAMPLING_platform %in% c("WESTERN FLYER", "RACHEL CARSON", "FULMAR", "WESTERNFLYER"," WESTERN FLYER", "POINT LOBOS", "Point Lobos",
"Point lobos",  "Point Sur",  "McArthur II", "JohnMartin","JOHN MARTIN","POINT SUR","rachel carson","point lobos","McArthur II", "Western Flyer", "Fulmar",  "Rachel Carson", "Rachel carson","CPF","POINT 0"),"Shipboard Niskin",
    ifelse(SAMPLING_platform %in% c("REUBEN LASKER"),"Net filtrate",
    ifelse(SAMPLING_platform %in% c("DocRicketts"),"ROV",
    ifelse(SAMPLING_platform %in% c("Mesocosm"),"Mesocosm",
    ifelse(SAMPLING_platform %in% c("Tethys","MAKAI","DORADO","AKU","Dorado389"),"3G ESP",
    ifelse(samp_collection_device %in% c("CTD/SBE911 Rosette with Niskin bottle"),"Shipboard Niskin",
    ifelse(SAMPLE_Collection_Device %in% c("CTD/SBE911 Rosette with Niskin bottle","CTD/SBE911_Rosette_with_Niskin_bottle"),"Shipboard Niskin",
    ifelse(samp_collection_device %in% c("SCUBA diver/bottle"),"SCUBA diver",
    ifelse(samp_collection_device %in% c("Methot_net"),"Net filtrate",
    ifelse(sample_name %in% c("CN18Fc27_bongo__GG","CN18Fc38_bongo__GG","CN18Fc43_bongo_GG"),"Net filtrate",
      metadata$samp_collection_device)))))))))))))

#metadata_updated$sampling_device

## Project
metadata_updated$SAMPLING_cruise <- as.character(metadata_updated$SAMPLING_cruise)
metadata_updated$SAMPLING_campaign <- as.character(metadata_updated$SAMPLING_campaign)
metadata_updated$description <- as.character(metadata_updated$description)
metadata_updated$project_name <- as.character(metadata_updated$project_name)

metadata_updated %>% mutate(.,project_new = ifelse(SAMPLING_cruise %in% c("CANON11","CANON13","CANON15","CANON16","CANON17S","CN18S","CN18F","C3PO19","CN19S","Lasker2018","Flyer2018"),SAMPLING_cruise,
                                      ifelse(!(description %in% c("")),description,
                                      ifelse(!(SAMPLING_campaign %in% c("")), SAMPLING_campaign,
                                      project_name)))) -> metadata_updated


## Location
metadata_updated$geo_loc_name <- as.character(metadata_updated$geo_loc_name)

## Depth as categorical
#sort(metadata_updated$depth)
metadata_updated %>% mutate(.,depth_cat = ifelse(metadata_updated$depth >= 0 & metadata_updated$depth <= 5,"0-5",
            ifelse(metadata_updated$depth > 5 & metadata_updated$depth < 20,"6-19",
            ifelse(metadata_updated$depth >= 20 & metadata_updated$depth < 29.5,"20-29",
            ifelse(metadata_updated$depth >= 29.5 & metadata_updated$depth < 40,"30-39",
            ifelse(metadata_updated$depth >= 40 & metadata_updated$depth < 60,"40-59",
            ifelse(metadata_updated$depth >= 60 & metadata_updated$depth <= 80,"60-80",
            ifelse(metadata_updated$depth >= 100 & metadata_updated$depth <= 199,"100-199",
            ifelse(metadata_updated$depth > 199 & metadata_updated$depth <= 250,"200-250",
            ifelse(metadata_updated$depth >= 280 & metadata_updated$depth <= 500,"280-500",
            ifelse(metadata_updated$SAMPLING_platform %in% c("Mesocosm"),"0-5",
            ifelse(is.na(metadata_updated$depth),"unknown",
            "600-1565")))))))))))) -> metadata_updated 

metadata_updated$depth_cat <- factor(metadata_updated$depth_cat,levels = c("0-5","6-19","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-1565"))

metadata_updated <- metadata_updated

# Use lubridate to convert date/time to posixct
metadata_updated$SAMPLING_date_time <- as.character(metadata_updated$SAMPLING_date_time)
metadata_updated %>% mutate(.,date_time_updated = parse_date_time(SAMPLING_date_time,c("ymdHM","mdyHM","ymdHMS"))) -> metadata_updated

class(metadata_updated$date_time_updated)
# Let's add in a month field, so we can try to visualize seasonality
metadata_updated %>% mutate(.,month_updated = month(date_time_updated,label = TRUE, abbr = FALSE)) -> metadata_updated
metadata_updated$month_updated <- as.character(metadata_updated$month_updated)
metadata_updated %>% mutate(.,month_updated = ifelse(!(is.na(month_updated)),month_updated,
                                      ifelse(month %in% c("Jan"),"January",
                                      ifelse(month %in% c("Feb"),"February",
                                      ifelse(Month == "March" | month %in% c("Mar"),"March",
                                      ifelse(month %in% c("Apr"),"April",
                                      ifelse(Month == "May" | month %in% c("May"),"May",
                                      ifelse(month %in% c("Jun"),"June",
                                      ifelse(month %in% c("Jul"),"July",
                                      ifelse(month %in% c("Aug"),"August",
                                      ifelse(month %in% c("Sep"),"September",
                                      ifelse(month %in% c("Oct"),"October",
                                      ifelse(month %in% c("Nov"),"November",
                                      ifelse(month %in% c("Dec"),"December",month_updated))))))))))))))-> metadata_updated


# Let's add a season field
metadata_updated %>% mutate(.,season = ifelse(month_updated %in% c("November","December","January"),"Winter",
                                              ifelse(month_updated %in% c("February","March","April"),"Spring",
                                                     ifelse(month_updated %in% c("May","June","July"),"Summer",
                                                            ifelse(month_updated %in% c("August","September","October"),"Fall",month_updated))))) -> metadata_updated
metadata_updated$season <- factor(metadata_updated$season, levels = c("Winter","Spring","Summer","Fall"))

# Let's add a different season field - using the "conventional" seasons
metadata_updated %>% mutate(.,season_conv = ifelse(month_updated %in% c("January", "February", "March"),"Winter",
                                              ifelse(month_updated %in% c("April", "May", "June"),"Spring",
                                                     ifelse(month_updated %in% c("July", "August", "September"),"Summer",
                                                            ifelse(month_updated %in% c("October", "November", "December"),"Fall",month_updated))))) -> metadata_updated
metadata_updated$season_conv <- factor(metadata_updated$season_conv, levels = c("Winter","Spring","Summer","Fall"))

metadata_updated$month_updated <- factor(metadata_updated$month_updated, levels = c("January", "February", "March", "April", "May", "June","July","August","September","October","November","December"))

# Let's add a year field
metadata_updated %>% mutate(.,year_updated = year(date_time_updated)) -> metadata_updated

Drop samples based on metadata attributes

# Drop 67-70 for the C3PO19 cruise - this is the cast where we the filters got mixed up
metadata_updated <- subset(metadata_updated, !(SAMPLING_station %in% c("67-70") & SAMPLING_cruise == "C3PO19"))

# Take out all non-environmental samples
metadata_updated <- subset(metadata_updated, sample_type %in% c("environmental"))

# Take out the non-eDNA samples from the Florida zooplankton study
metadata_updated <- subset(metadata_updated, !(type_sample %in% c("pre_filt","tissue","extr_blank","filt_blank")))

# Drop netfilt samples
metadata_updated <- subset(metadata_updated, !(sample_name %in% netfilt_sample_names))

# Move sample names to rownames for compatibility with DEICODE
metadata_updated <- metadata_updated[!(is.na(metadata_updated$sample_name)),]
rownames(metadata_updated) <- NULL
metadata_updated <- column_to_rownames(metadata_updated,"sample_name")

Fix asv table so that there aren’t any leading Xs in the column names

asv_table <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/COI/for_MBON/decontaminated/072720/COI_master_ASV_table.csv",row.names = 1, check.names = FALSE)
destroyX = function(es) {
  f = es
  for (col in c(1:ncol(f))){ #for each column in dataframe
    if (startsWith(colnames(f)[col], "X") == TRUE)  { #if starts with 'X' ..
      colnames(f)[col] <- substr(colnames(f)[col], 2, 100) #get rid of it
    }
  }
  assign(deparse(substitute(es)), f, inherits = TRUE) #assign corrected data to original name
}
asv_table <- destroyX(asv_table)

Load master asv_table, metadata, and tax_table objects

asv_table <- otu_table(asv_table,taxa_are_rows = TRUE)
sample_data <- sample_data(metadata_updated)
tax_table <- tax_table(as.matrix(read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/COI/for_MBON/decontaminated/072720/COI_master_tax_table.csv",row.names = 1)))
master_phyloseq_COI <- merge_phyloseq(asv_table,sample_data,tax_table)

Remove terrestrial contaminants

## Decontaminate
# no humans
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Family != "Hominidae"
 )
# no cows
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Family != "Bovidae"
 )
# no pigs
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Family != "Suidae"
 )
# no dogs
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Family != "Canidae"
 )
# no chickens
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Genus != "Gallus"
 )
# no turkeys
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Genus != "Meleagris"
 )
# no cats
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Family != "Felidae"
 )
# no rats!! and other rodents
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Order != "Rodentia"
 )
# no rabbits
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Order != "Lagomorpha"
 )

# no insects
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Class != "Insecta"
 )

# no arachnids
master_phyloseq_COI <- master_phyloseq_COI %>%
 subset_taxa(
   Class != "Arachnida"
 )

master_phyloseq_COI <- prune_taxa(taxa_sums(master_phyloseq_COI) > 0,master_phyloseq_COI)
master_phyloseq_COI

Subset KOSMOS samples and export data

# Subset out KOSMOS samples
kosmos_COI_phyloseq <- prune_samples(sample_data(master_phyloseq_COI)$SAMPLING_project %in% c("KOSMOS"),master_phyloseq_COI)
kosmos_COI_phyloseq <- prune_taxa(taxa_sums(kosmos_COI_phyloseq)>0,kosmos_COI_phyloseq)

# kosmos_COI <- left_join(kosmos_COI_asv_table, kosmos_COI_tax_table, by = "ASV")
# write.csv(kosmos_COI, "/Users/mmin/Documents/KOSMOS_data_local/kosmos_COI_061620.csv")

# EXPORT FILES FOR UPLOAD TO GITHUB
# ASV table
kosmos_COI_asv_table <- as.data.frame(as(otu_table(kosmos_COI_phyloseq),"matrix"))
rownames_to_column(kosmos_COI_asv_table,"ASV") -> kosmos_COI_asv_table
# tax table
kosmos_COI_tax_table <- as.data.frame(as(tax_table(kosmos_COI_phyloseq),"matrix"))
rownames_to_column(kosmos_COI_tax_table,"ASV") -> kosmos_COI_tax_table
# metadata
kosmos_COI_metadata <- as.data.frame(as.matrix(sample_data(kosmos_COI_phyloseq)))
rownames_to_column(kosmos_COI_metadata,"SampleID") -> kosmos_COI_metadata

# export three files
write.csv(kosmos_COI_asv_table, here::here("data", "kosmos_eDNA_data", "kosmos_COI_asv_table.csv"), row.names = FALSE)
write.csv(kosmos_COI_tax_table, here::here("data", "kosmos_eDNA_data", "kosmos_COI_tax_table.csv"), row.names = FALSE)
write.csv(kosmos_COI_metadata, here::here("data", "kosmos_eDNA_data", "kosmos_COI_metadata.csv"), row.names = FALSE)

load KOSMOS data

# read data
kosmos_COI_asv_table <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_COI_asv_table.csv"), row.names = 1)
kosmos_COI_tax_table <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_COI_tax_table.csv"), row.names = 1)
kosmos_COI_metadata <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_COI_metadata.csv"), row.names = 1)


# reformat and join as phyloseq object
asv_table <- otu_table(kosmos_COI_asv_table,taxa_are_rows = TRUE)
sample_data <- sample_data(kosmos_COI_metadata)
tax_table <- tax_table(as.matrix(kosmos_COI_tax_table))
kosmos_COI_phyloseq <- merge_phyloseq(asv_table,sample_data,tax_table)

# Subset only environmental samples (non-controls)
kosmos_COI_16_phyloseq <- prune_samples(sample_data(kosmos_COI_phyloseq)$SAMPLING_station_number %in% c("M1","MP"), kosmos_COI_phyloseq)
kosmos_COI_16_phyloseq <- prune_taxa(taxa_sums(kosmos_COI_16_phyloseq)>0,kosmos_COI_16_phyloseq)

18S

Read in metadata, make edits

metadata <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/18S/for_MBON/decontaminated/072720/18S_master_metadata.csv")
metadata$sample_name <- as.character(metadata$sample_name)

## Depth
# Change depth to character so that the mutate line will work
metadata$depth <- as.character(metadata$depth)

# Change surface to 1 m, bottom to 20 m for Florida samples
metadata %>% mutate(.,depth = ifelse(depth %in% c("Surface","SUR"),"1",
                                     ifelse(depth %in% c("Bottom","BOT"),"20",depth))) -> metadata

metadata$depth <- as.numeric(as.character(metadata$depth))

# Flip the depths for the 67-60 samples in the C3PO19 cruise
metadata$depth <- as.numeric(as.character(metadata$depth))
metadata %>% mutate(.,depth = ifelse(metadata$sample_name %in% c("C3PO19c28_12_eDNA_HH"),150,
                                            ifelse(metadata$sample_name %in% c("C3PO19c28_9_eDNA_HH"),100,
                                                   ifelse(metadata$sample_name %in% c("C3PO19c28_6_eDNA_HH"),60,
                                                          ifelse(metadata$sample_name %in% c("C3PO19c28_4_eDNA_HH"),20,
                                                                 ifelse(metadata$sample_name %in% c("C3PO19c28_3_eDNA_HH"),0,depth)))))) -> metadata

## Sampling device
netfilts <- grepl('netfilt|bongo|Bongo', metadata$sample_name)
netfilt_samples <- metadata$sample_name[netfilts]

metadata_updated <- metadata %>%
  mutate(.,sampling_device = ifelse(sample_type %in% c("positive","negative","blank"),"CONTROL",
    ifelse(sample_name %in% netfilt_samples,"Net filtrate",
    ifelse(SAMPLING_platform %in% c("WESTERN FLYER", "RACHEL CARSON", "FULMAR", "WESTERNFLYER"," WESTERN FLYER", "POINT LOBOS", "Point Lobos","Shimada",
"Point lobos",  "Point Sur",  "McArthur II", "JohnMartin","JOHN MARTIN","POINT SUR","rachel carson","point lobos","McArthur II", "Western Flyer", "Fulmar",  "Rachel Carson", "Rachel carson","CPF","POINT 0"),"Shipboard Niskin",
    ifelse(SAMPLING_platform %in% c("REUBEN LASKER"),"Net filtrate",
    ifelse(SAMPLING_platform %in% c("DocRicketts"),"ROV",
    ifelse(SAMPLING_platform %in% c("Mesocosm"),"Mesocosm",
    ifelse(depth %in% c("SUR","BOT"),"Shipboard Niskin",       
    ifelse(SAMPLING_platform %in% c("Tethys","MAKAI","DORADO","AKU","Dorado"),"3G ESP",
    ifelse(samp_collection_device %in% c("CTD/SBE911 Rosette with Niskin bottle"),"Shipboard Niskin",
    ifelse(SAMPLE_Collection_Device %in% c("CTD/SBE911 Rosette with Niskin bottle"),"Shipboard Niskin",
    ifelse(samp_collection_device %in% c("SCUBA diver/bottle"),"SCUBA diver",
    ifelse(samp_collection_device %in% c("Methot_net"),"Net filtrate",
    ifelse(sample_name %in% c("CN18Fc27_bongo_","CN18Fc38_bongo_","CN18Fc43_bongo"),"Net filtrate",
      metadata$samp_collection_device))))))))))))))

# Change depth to numeric
metadata_updated$depth <- as.numeric(as.character(metadata_updated$depth))

## Project
metadata_updated$SAMPLING_cruise <- as.character(metadata_updated$SAMPLING_cruise)
metadata_updated$SAMPLING_campaign <- as.character(metadata_updated$SAMPLING_campaign)
metadata_updated$description <- as.character(metadata_updated$description)
metadata_updated$project_name <- as.character(metadata_updated$project_name)
metadata_updated$PROJECT <- as.character(metadata_updated$PROJECT)

metadata_updated %>% mutate(.,project_new = ifelse(SAMPLING_cruise %in% c("CANON11","CANON13","CANON15","CANON16","CANON17S","CN18S","CN18F","C3PO19","CN19S","Lasker2018","Flyer2018"),SAMPLING_cruise,
                                      ifelse(!(description %in% c("")),description,
                                      ifelse(!(SAMPLING_campaign %in% c("")), SAMPLING_campaign,
                                      ifelse(!(PROJECT %in% c("")), PROJECT,
                                      project_name))))) -> metadata_updated


## Location
metadata_updated$geo_loc_NAme <- as.character(metadata_updated$geo_loc_NAme)
metadata_updated$geo_loc_name <- as.character(metadata_updated$geo_loc_name)
metadata_updated %>% mutate(.,geo_loc_name = ifelse(geo_loc_name %in% c(""),geo_loc_NAme,geo_loc_name)) -> metadata_updated
# Fix the FKNMS geo_loc_name
metadata_updated %>% mutate(.,geo_loc_name = ifelse(depth %in% c("SUR","BOT"),"USA:Florida:Keys",geo_loc_name)) -> metadata_updated
metadata_updated %>% mutate(.,geo_loc_name = ifelse(libraryID %in% c("M"),"USA:Florida:Keys",geo_loc_name)) -> metadata_updated

# Add in depth for DW addition sample from KOSMOS
metadata_updated["KOSMOSD15_DW2_UU",]$depth <- 30

# Create categorical depth field
metadata_updated %>% mutate(.,depth_cat = ifelse(metadata_updated$depth >= 0 & metadata_updated$depth <= 5.5,"0-5",
            ifelse(metadata_updated$depth > 5.5 & metadata_updated$depth <= 20,"6-20",
            ifelse(metadata_updated$depth > 20 & metadata_updated$depth < 29.5,"20-29",
            ifelse(metadata_updated$depth >= 29.5 & metadata_updated$depth < 40,"30-39",
            ifelse(metadata_updated$depth >= 40 & metadata_updated$depth < 60,"40-59",
            ifelse(metadata_updated$depth >= 60 & metadata_updated$depth <= 80,"60-80",
            ifelse(metadata_updated$depth >= 100 & metadata_updated$depth <= 199,"100-199",
            ifelse(metadata_updated$depth > 199 & metadata_updated$depth <= 250,"200-250",
            ifelse(metadata_updated$depth >= 280 & metadata_updated$depth <= 700,"280-700",
            ifelse(metadata_updated$SAMPLING_platform %in% c("Mesocosm"),"0-5",
            "600-700"))))))))))) -> metadata_updated 

# Use lubridate to convert date/time to posixct

# First consolidate all date/time information into one column
metadata_updated$collection_date <- as.character(metadata_updated$collection_date)
metadata_updated$SAMPLING_date_time <- as.character(metadata_updated$SAMPLING_date_time)
metadata_updated %>% mutate(., SAMPLING_date_time = ifelse(is.na(SAMPLING_date_time) | SAMPLING_date_time == "", collection_date, SAMPLING_date_time)) -> metadata_updated

metadata_updated %>% mutate(.,date_time_updated = parse_date_time(SAMPLING_date_time,c("ymdHM","mdyHM","ymdHMS"))) -> metadata_updated

# Let's add in a month field, so we can try to visualize seasonality
metadata_updated %>% mutate(.,month_updated = month(date_time_updated,label = TRUE, abbr = FALSE)) -> metadata_updated
metadata_updated$month_updated <- as.character(metadata_updated$month_updated)
# consolidate two month columns
metadata_updated$month <- as.character(metadata_updated$month)
metadata_updated$Month <- as.character(metadata_updated$Month)
metadata_updated %>% mutate(.,month = ifelse(is.na(month) | month == "",Month,month)) -> metadata_updated
metadata_updated %>% mutate(.,month_updated = ifelse(!(is.na(month_updated)),month_updated,
                                      ifelse(month %in% c("Jan"),"January",
                                      ifelse(month %in% c("Feb"),"February",
                                      ifelse(month %in% c("Mar", "March"),"March",
                                      ifelse(month %in% c("Apr"),"April",
                                      ifelse(month %in% c("May"),"May",
                                      ifelse(month %in% c("Jun"),"June",
                                      ifelse(month %in% c("Jul"),"July",
                                      ifelse(month %in% c("Aug"),"August",
                                      ifelse(month %in% c("Sep"),"September",
                                      ifelse(month %in% c("Oct"),"October",
                                      ifelse(month %in% c("Nov"),"November",
                                      ifelse(month %in% c("Dec"),"December",month_updated))))))))))))))-> metadata_updated
# Let's add a season field
metadata_updated %>% mutate(.,season = ifelse(month_updated %in% c("November","December","January"),"Winter",
                                              ifelse(month_updated %in% c("February","March","April"),"Spring",
                                                     ifelse(month_updated %in% c("May","June","July"),"Summer",
                                                            ifelse(month_updated %in% c("August","September","October"),"Fall",month_updated))))) -> metadata_updated
metadata_updated$season <- factor(metadata_updated$season, levels = c("Winter","Spring","Summer","Fall"))

# Let's add a different season field - using the "conventional" seasons
metadata_updated %>% mutate(.,season_conv = ifelse(month_updated %in% c("January", "February", "March"),"Winter",
                                              ifelse(month_updated %in% c("April", "May", "June"),"Spring",
                                                     ifelse(month_updated %in% c("July", "August", "September"),"Summer",
                                                            ifelse(month_updated %in% c("October", "November", "December"),"Fall",month_updated))))) -> metadata_updated
metadata_updated$season_conv <- factor(metadata_updated$season_conv, levels = c("Winter","Spring","Summer","Fall"))

metadata_updated$month_updated <- factor(metadata_updated$month_updated, levels = c("January", "February", "March", "April", "May", "June","July","August","September","October","November","December"))

# Fix the location for the FGNMS samples
metadata_updated %>% mutate(.,geo_loc_name = ifelse(metadata_updated$sample_name %in% c("MBON301_M","MBON302_M","MBON303_M","MBON304_M","MBON305_M","MBON306_M"),"USA:Texas:Flower Garden Banks",geo_loc_name)) -> metadata_updated

# Let's add a year field
metadata_updated %>% mutate(.,year_updated = year(date_time_updated)) -> metadata_updated

Drop samples based on metadata attributes

# Drop 67-70 for the C3PO19 cruise - this is the cast where the samples got mixed up
metadata_updated <- subset(metadata_updated, !(SAMPLING_station %in% c("67-70") & SAMPLING_cruise == "C3PO19"))

# Take out the non-eDNA samples from FK zooplankton study
metadata_updated <- subset(metadata_updated, !(type_sample %in% c("pre_filt","tissue","extr_blank","filt_blank")))
metadata_updated <- subset(metadata_updated, sample_type %in% c("environment","environmental"))

# Drop netfilt samples
metadata_updated <- subset(metadata_updated, !(sample_name %in% netfilt_sample_names))
metadata_updated <- subset(metadata_updated, !(sampling_device %in% c("Net filtrate")))

### Move sample names to rownames for export
metadata_updated <- metadata_updated[!(is.na(metadata_updated$sample_name)),]
rownames(metadata_updated) <- NULL
metadata_updated <- column_to_rownames(metadata_updated,"sample_name")

Fix asv table so that there aren’t any leading Xs in the column names

asv_table <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/18S/for_MBON/decontaminated/072720/18S_master_ASV_table.csv",row.names = 1, check.names = FALSE)
destroyX = function(es) {
  f = es
  for (col in c(1:ncol(f))){ #for each column in dataframe
    if (startsWith(colnames(f)[col], "X") == TRUE)  { #if starts with 'X' ..
      colnames(f)[col] <- substr(colnames(f)[col], 2, 100) #get rid of it
    }
  }
  assign(deparse(substitute(es)), f, inherits = TRUE) #assign corrected data to original name
}
asv_table <- destroyX(asv_table)

Make edits to tax table

tax_table_df <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/18S/for_MBON/decontaminated/072720/18S_master_tax_table.csv")
tax_table_df$Kingdom <- as.character(tax_table_df$Kingdom)
tax_table_df$Phylum <- as.character(tax_table_df$Phylum)
tax_table_df$Class <- as.character(tax_table_df$Class)
tax_table_df$Order <- as.character(tax_table_df$Order)
tax_table_df$Family <- as.character(tax_table_df$Family)
tax_table_df$Genus <- as.character(tax_table_df$Genus)
tax_table_df$Species <- as.character(tax_table_df$Species)

# Mutate Class
tax_table_df %>% mutate(.,Class = ifelse(tax_table_df$Species %in% c("Acantharian sp. 6201"),"Acantharea",
                                  ifelse(tax_table_df$Order %in% c("Diplonemea"), "Diplonemea",
                                  ifelse(tax_table_df$Order %in% c("Choanoflagellata"),"Choanoflagellatea",
                                    tax_table_df$Class)))) -> tax_table_df

tax_table_df <- column_to_rownames(tax_table_df,"ASV")

Load master asv_table, metadata, and tax_table objects

asv_table <- otu_table(asv_table,taxa_are_rows = TRUE)
sample_data <- sample_data(metadata_updated)
tax_table <- tax_table(as.matrix(tax_table_df))
master_phyloseq_18S <- merge_phyloseq(asv_table,sample_data,tax_table)

Remove terrestrial contaminants

## Decontaminate
# no humans
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Family != "Hominidae"
 )
# no cows
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Family != "Bovidae"
 )
# no pigs
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Family != "Suidae"
 )
# no dogs
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Family != "Canidae"
 )
# no chickens
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Genus != "Gallus"
 )
# no turkeys
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Genus != "Meleagris"
 )
# no cats
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Family != "Felidae"
 )
# no rats!! and other rodents
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Order != "Rodentia"
 )
# no rabbits
master_phyloseq_18S <- master_phyloseq_18S %>%
 subset_taxa(
   Order != "Lagomorpha"
 )

master_phyloseq_18S <- prune_taxa(taxa_sums(master_phyloseq_18S) > 0,master_phyloseq_18S)
master_phyloseq_18S

Subset KOSMOS samples and export data

# Subset out KOSMOS samples
kosmos_18S_phyloseq <- prune_samples(sample_data(master_phyloseq_18S)$SAMPLING_project %in% c("KOSMOS"),master_phyloseq_18S)
kosmos_18S_phyloseq <- prune_taxa(taxa_sums(kosmos_18S_phyloseq)>0,kosmos_18S_phyloseq)

# kosmos_18S <- left_join(kosmos_18S_asv_table, kosmos_18S_tax_table, by = "ASV")
# write.csv(kosmos_18S, "/Users/mmin/Documents/KOSMOS_data_local/kosmos_18S_061620.csv")

# EXPORT FILES FOR UPLOAD TO GITHUB
# ASV table
kosmos_18S_asv_table <- as.data.frame(as(otu_table(kosmos_18S_phyloseq),"matrix"))
rownames_to_column(kosmos_18S_asv_table,"ASV") -> kosmos_18S_asv_table
# tax table
kosmos_18S_tax_table <- as.data.frame(as(tax_table(kosmos_18S_phyloseq),"matrix"))
rownames_to_column(kosmos_18S_tax_table,"ASV") -> kosmos_18S_tax_table
# metadata
kosmos_18S_metadata <- as.data.frame(as.matrix(sample_data(kosmos_18S_phyloseq)))
rownames_to_column(kosmos_18S_metadata,"SampleID") -> kosmos_18S_metadata

# export three files
write.csv(kosmos_18S_asv_table, here::here("data", "kosmos_eDNA_data", "kosmos_18S_asv_table.csv"), row.names = FALSE)
write.csv(kosmos_18S_tax_table, here::here("data", "kosmos_eDNA_data", "kosmos_18S_tax_table.csv"), row.names = FALSE)
write.csv(kosmos_18S_metadata, here::here("data", "kosmos_eDNA_data", "kosmos_18S_metadata.csv"), row.names = FALSE)

load KOSMOS data

# read data
kosmos_18S_asv_table <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_18S_asv_table.csv"), row.names = 1)
kosmos_18S_tax_table <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_18S_tax_table.csv"), row.names = 1)
kosmos_18S_metadata <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_18S_metadata.csv"), row.names = 1)


# reformat and join as phyloseq object
asv_table <- otu_table(kosmos_18S_asv_table,taxa_are_rows = TRUE)
sample_data <- sample_data(kosmos_18S_metadata)
tax_table <- tax_table(as.matrix(kosmos_18S_tax_table))
kosmos_18S_phyloseq <- merge_phyloseq(asv_table,sample_data,tax_table)

# Subset only environmental samples (non-controls)
kosmos_18S_16_phyloseq <- prune_samples(sample_data(kosmos_18S_phyloseq)$SAMPLING_station_number %in% c("M1","MP"), kosmos_18S_phyloseq)
kosmos_18S_16_phyloseq <- prune_taxa(taxa_sums(kosmos_18S_16_phyloseq)>0,kosmos_18S_16_phyloseq)

12S

Read in metadata, make edits (TD + Classic)

metadata <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/12S/for_MBON/decontaminated/092120/12S_master_metadata.csv")
metadata$sample_name <- as.character(metadata$sample_name)

metadata <- subset(metadata, sample_type %in% c("environmental"))

# Drop 67-70 for the C3PO19 cruise
metadata <- subset(metadata, !(SAMPLING_station %in% c("67-70") & SAMPLING_cruise == "C3PO19"))

# Flip the depths for station 67-60 to fix them.
metadata$depth <- as.numeric(as.character(metadata$depth))
metadata %>% mutate(.,depth = ifelse(metadata$sample_name %in% c("C3PO19c28_12_eDNA_QQ"),150,
                                            ifelse(metadata$sample_name %in% c("C3PO19c28_9_eDNA_QQ"),100,
                                                   ifelse(metadata$sample_name %in% c("C3PO19c28_6_eDNA_QQ"),60,
                                                          ifelse(metadata$sample_name %in% c("C3PO19c28_4_eDNA_QQ"),20,
                                                                 ifelse(metadata$sample_name %in% c("C3PO19c28_3_eDNA_QQ"),0,depth)))))) -> metadata

# Drop 67-70
metadata <- subset(metadata, !(SAMPLING_station %in% c("67-70")))

## Create a new field for sampling device, since the metadata has different formats, but always has a SAMPLING_platform field that we can use to determine what the sampling device was.

# netfilts <- grepl('netfilt|bongo|Bongo', metadata$sample_name)
# netfilt_samples <- metadata$sample_name[netfilts]

netfilts <- grepl('netfilt|bongo|Bongo', metadata$sample_name)
netfilt_samples <- subset(metadata, metadata$sample_name %in% metadata$sample_name[netfilts])
netfilt_sample_names <- netfilt_samples$sample_name

metadata_updated <- metadata %>%
  mutate(.,sampling_device = ifelse(sample_type %in% c("positive","negative","blank"),"CONTROL",
    ifelse(sample_name %in% netfilt_sample_names,"Net filtrate",
    ifelse(SAMPLING_platform %in% c("WESTERN FLYER","MCARTHUR II",  "RACHEL CARSON", "FULMAR", "WESTERNFLYER"," WESTERN FLYER", "POINT LOBOS", "Point Lobos","Point lobos",  "Point Sur",  "McArthur II", "JohnMartin","JOHN MARTIN","POINT SUR","rachel carson","point lobos","McArthur II", "Western Flyer", "Fulmar",  "Rachel Carson", "Rachel carson","CPF","POINT 0"),"Shipboard Niskin",
    ifelse(SAMPLING_platform %in% c("REUBEN LASKER"),"Net filtrate",
    ifelse(SAMPLING_cruise %in% c("Flyer2018", "Lasker2018"),"Shipboard Niskin",
    ifelse(SAMPLING_platform %in% c("DocRicketts"),"ROV",
    ifelse(SAMPLING_platform %in% c("Mesocosm"),"Mesocosm",
    ifelse(SAMPLING_platform %in% c("SCUBA") | SAMPLING_platform_type %in% c("scuba"),"SCUBA",
    ifelse(SAMPLING_platform %in% c("Tethys","MAKAI","DORADO","AKU"),"3G ESP",metadata$sample_type))))))))))

metadata_updated$SAMPLING_cruise <- as.character(metadata_updated$SAMPLING_cruise)
metadata_updated$SAMPLING_campaign <- as.character(metadata_updated$SAMPLING_campaign)
metadata_updated$project_name <- as.character(metadata_updated$project_name)

metadata_updated %>% mutate(.,project_new = ifelse(SAMPLING_cruise %in% c("CANON11","CANON13","CANON15","CANON16","CANON17S","CN12S","CN18F","C3PO19","CN19S","Lasker2018","Flyer2018"),SAMPLING_cruise,
                                      ifelse(!(SAMPLING_campaign %in% c("")), SAMPLING_campaign,
                                      project_name))) -> metadata_updated

## Project
metadata_updated$SAMPLING_cruise <- as.character(metadata_updated$SAMPLING_cruise)
metadata_updated$SAMPLING_campaign <- as.character(metadata_updated$SAMPLING_campaign)
metadata_updated$project_name <- as.character(metadata_updated$project_name)

metadata_updated %>% mutate(.,project_new = ifelse(SAMPLING_cruise %in% c("CANON11","CANON13","CANON15","CANON16","CANON17S","CN12S","CN18F","C3PO19","CN19S","Lasker2018","Flyer2018"),SAMPLING_cruise,
                                      ifelse(!(SAMPLING_campaign %in% c("")), SAMPLING_campaign,
                                      project_name))) -> metadata_updated


## Location
metadata_updated$geo_loc_name <- as.character(metadata_updated$geo_loc_name)

# Drop netfilt samples
metadata_updated <- subset(metadata_updated, !(sample_name %in% netfilt_sample_names))
metadata_updated <- subset(metadata_updated, !(sampling_device %in% c("Net filtrate")))

## Depth as categorical
# sort(metadata_updated$depth)
metadata_updated %>% mutate(.,depth_cat = ifelse(metadata_updated$depth >= 0 & metadata_updated$depth <= 5.5,"0-5",
            ifelse(metadata_updated$depth > 5.5 & metadata_updated$depth <= 20,"6-20",
            ifelse(metadata_updated$depth > 20 & metadata_updated$depth < 29.5,"20-29",
            ifelse(metadata_updated$depth >= 29.5 & metadata_updated$depth < 40,"30-39",
            ifelse(metadata_updated$depth >= 40 & metadata_updated$depth < 60,"40-59",
            ifelse(metadata_updated$depth >= 60 & metadata_updated$depth <= 80,"60-80",
            ifelse(metadata_updated$depth >= 100 & metadata_updated$depth <= 199,"100-199",
            ifelse(metadata_updated$depth > 199 & metadata_updated$depth <= 250,"200-250",
            ifelse(metadata_updated$depth >= 280 & metadata_updated$depth <= 500,"280-500",
            ifelse(metadata_updated$depth >= 600 & metadata_updated$depth <= 700,"600-700",
            "unknown"))))))))))) -> metadata_updated 

metadata_updated$depth_cat <- factor(metadata_updated$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))

# Use lubridate to convert date/time to posixct
metadata_updated$SAMPLING_date_time <- as.character(metadata_updated$SAMPLING_date_time)
metadata_updated %>% mutate(.,date_time_updated = parse_date_time(SAMPLING_date_time,c("ymdHM","mdyHM","ymdHMS"))) -> metadata_updated

class(metadata_updated$date_time_updated)
# Let's add in a month field, so we can try to visualize seasonality
metadata_updated %>% mutate(.,month_updated = month(date_time_updated,label = TRUE, abbr = FALSE)) -> metadata_updated
metadata_updated$month_updated <- as.character(metadata_updated$month_updated)

# Let's add a season field
metadata_updated %>% mutate(.,season = ifelse(month_updated %in% c("November","December","January"),"Winter",
                                              ifelse(month_updated %in% c("February","March","April"),"Spring",
                                                     ifelse(month_updated %in% c("May","June","July"),"Summer",
                                                            ifelse(month_updated %in% c("August","September","October"),"Fall",month_updated))))) -> metadata_updated
metadata_updated$season <- factor(metadata_updated$season, levels = c("Winter","Spring","Summer","Fall"))

metadata_updated$month_updated <- factor(metadata_updated$month_updated, levels = c("January", "February", "March", "April", "May", "June","July","August","September","October","November","December"))

# Let's add a year field
metadata_updated %>% mutate(.,year_updated = year(date_time_updated)) -> metadata_updated

metadata_updated <- metadata_updated[!(is.na(metadata_updated$sample_name)),]

# Rearrange so that the touchdown replicates are first, and then drop the classic replicates
metadata_updated %>% dplyr::arrange(.,desc(PCR_settings)) %>% distinct(.,original_name,.keep_all = TRUE) -> metadata_updated


# Change sample_name to row names
rownames(metadata_updated) <- NULL
metadata_updated <- column_to_rownames(metadata_updated,"sample_name")

Fix asv table so that there aren’t any leading Xs in the column names (TD only)

asv_table <- read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/12S/for_MBON/decontaminated/092120/12S_master_ASV_table.csv",row.names = 1, check.names = FALSE)
destroyX = function(es) {
  f = es
  for (col in c(1:ncol(f))){ #for each column in dataframe
    if (startsWith(colnames(f)[col], "X") == TRUE)  { #if starts with 'X' ..
      colnames(f)[col] <- substr(colnames(f)[col], 2, 100) #get rid of it
    }
  }
  assign(deparse(substitute(es)), f, inherits = TRUE) #assign corrected data to original name
}
asv_table <- destroyX(asv_table)

Load master asv_table, metadata, and tax_table objects

asv_table <- otu_table(asv_table,taxa_are_rows = TRUE)
# rownames(metadata_updated) <- NULL
# metadata_updated <- column_to_rownames(metadata_updated,"sample_name")
sample_data <- sample_data(metadata_updated)
tax_table <- tax_table(as.matrix(read.csv(file = "/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/12S/for_MBON/decontaminated/092120/12S_master_tax_table.csv",row.names = 1)))
master_phyloseq_allplates <- merge_phyloseq(asv_table,sample_data,tax_table)

Prune phyloseq object

  • Remove plate II (same samples are on plate QQ)
  • Drop the classic sample replicates from plate QQ
master_phyloseq_allplates <- prune_samples(sample_data(master_phyloseq_allplates)$libraryID != "II",master_phyloseq_allplates)
master_phyloseq_allplates <- prune_samples(!(sample_data(master_phyloseq_allplates)$libraryID == "QQ" & sample_data(master_phyloseq_allplates)$PCR_settings == "classic"),master_phyloseq_allplates)

Check summary statistics

nsamples(master_phyloseq_allplates)
ntaxa(master_phyloseq_allplates)
sum(sample_sums(master_phyloseq_allplates))

Subset out Bacteria and Archaea, terrestrial contaminants

#phyloseq_species <- tax_glom(master_phyloseq,taxrank=rank_names(phyloseq)[7], NArm = FALSE)

master_phyloseq_12S <- master_phyloseq_allplates %>%
 subset_taxa(
   !(Kingdom %in% c("Archaea","Bacteria"))
 )

## Decontaminate
# no humans
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Family != "Hominidae"
 )
# no cows
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Family != "Bovidae"
 )
# no pigs
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Family != "Suidae"
 )
# no dogs
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Family != "Canidae"
 )
# no chickens
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Genus != "Gallus"
 )
# no turkeys
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Genus != "Meleagris"
 )
# no cats
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Order != "Carnivora"
 )
# no rats!! and other rodents
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Family != "Felidae"
 )
# no rabbits
master_phyloseq_12S <- master_phyloseq_12S %>%
 subset_taxa(
   Order != "Lagomorpha"
 )

Subset KOSMOS samples and export data

# Subset out KOSMOS samples
kosmos_12S_phyloseq <- prune_samples(sample_data(master_phyloseq_12S)$SAMPLING_project %in% c("KOSMOS"),master_phyloseq_12S)
kosmos_12S_phyloseq <- prune_taxa(taxa_sums(kosmos_12S_phyloseq)>0,kosmos_12S_phyloseq)

# kosmos_12S <- left_join(kosmos_12S_asv_table, kosmos_12S_tax_table, by = "ASV")
# write.csv(kosmos_12S, "/Users/mmin/Documents/KOSMOS_data_local/kosmos_12S_061620.csv")

# EXPORT FILES FOR UPLOAD TO GITHUB
# ASV table
kosmos_12S_asv_table <- as.data.frame(as(otu_table(kosmos_12S_phyloseq),"matrix"))
rownames_to_column(kosmos_12S_asv_table,"ASV") -> kosmos_12S_asv_table
# tax table
kosmos_12S_tax_table <- as.data.frame(as(tax_table(kosmos_12S_phyloseq),"matrix"))
rownames_to_column(kosmos_12S_tax_table,"ASV") -> kosmos_12S_tax_table
# metadata
kosmos_12S_metadata <- as.data.frame(as.matrix(sample_data(kosmos_12S_phyloseq)))
rownames_to_column(kosmos_12S_metadata,"SampleID") -> kosmos_12S_metadata

# export three files
write.csv(kosmos_12S_asv_table, here::here("data", "kosmos_eDNA_data", "kosmos_12S_asv_table.csv"), row.names = FALSE)
write.csv(kosmos_12S_tax_table, here::here("data", "kosmos_eDNA_data", "kosmos_12S_tax_table.csv"), row.names = FALSE)
write.csv(kosmos_12S_metadata, here::here("data", "kosmos_eDNA_data", "kosmos_12S_metadata.csv"), row.names = FALSE)

load KOSMOS data

# read data
kosmos_12S_asv_table <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_12S_asv_table.csv"), row.names = 1)
kosmos_12S_tax_table <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_12S_tax_table.csv"), row.names = 1)
kosmos_12S_metadata <- read.csv(here::here("data", "kosmos_eDNA_data", "kosmos_12S_metadata.csv"), row.names = 1)


# reformat and join as phyloseq object
asv_table <- otu_table(kosmos_12S_asv_table,taxa_are_rows = TRUE)
sample_data <- sample_data(kosmos_12S_metadata)
tax_table <- tax_table(as.matrix(kosmos_12S_tax_table))
kosmos_12S_phyloseq <- merge_phyloseq(asv_table,sample_data,tax_table)

# Subset only environmental samples (non-controls)
kosmos_12S_16_phyloseq <- prune_samples(sample_data(kosmos_12S_phyloseq)$SAMPLING_station_number %in% c("M1","MP"), kosmos_12S_phyloseq)
kosmos_12S_16_phyloseq <- prune_taxa(taxa_sums(kosmos_12S_16_phyloseq)>0,kosmos_12S_16_phyloseq)

16S rRNA: Plastidial (chloroplast) Reads

Load in phyloseq object - chloroplasts

chloroplast_asv <- read.csv(here::here("data", "Worden_lab_data", "kosmos_chloroplast_asv_table.csv"),row.names = 1)
rownames(chloroplast_asv) <- NULL
chloroplast_asv %>% column_to_rownames(.,"asv") -> chloroplast_asv
kosmos_16S_metadata <- read.csv(here::here("data", "Worden_lab_data", "KOSMOS_sample_data_16S.csv"),row.names = 1)
chloroplast_tax_table <- read.csv(here::here("data", "Worden_lab_data", "kosmos_chloroplast_tax_table_noX.csv"), row.names = 1)
rownames(chloroplast_tax_table) <- NULL
chloroplast_tax_table %>% column_to_rownames(.,"asv") -> chloroplast_tax_table
chloroplast_phyloseq <- merge_phyloseq(otu_table(chloroplast_asv,taxa_are_rows = TRUE),tax_table(as.matrix(chloroplast_tax_table)),sample_data(kosmos_16S_metadata))

# Subset only M1 and Pacific samples
chloroplast_16_phyloseq <- prune_samples(sample_data(chloroplast_phyloseq)$SAMPLING_station_number %in% c("M1","MP"), chloroplast_phyloseq)
# Drop all taxa with zero reads
chloroplast_16_phyloseq <- prune_taxa(taxa_sums(chloroplast_16_phyloseq)>0,chloroplast_16_phyloseq)

16S rRNA: Bacterial Reads

Change bacterial taxonomy, so that entries like “uncultured” and “unidentified” aren’t counted as taxonomy.

bacteria_tax_table <- read.csv(here::here("data", "Worden_lab_data", "kosmos_bacteria_tax_table.csv"), row.names = 1)

bacteria_tax_table$Kingdom <- as.character(bacteria_tax_table$Kingdom)
bacteria_tax_table$Phylum <- as.character(bacteria_tax_table$Phylum)
bacteria_tax_table$Class <- as.character(bacteria_tax_table$Class)
bacteria_tax_table$Order <- as.character(bacteria_tax_table$Order)
bacteria_tax_table$Family <- as.character(bacteria_tax_table$Family)
bacteria_tax_table$Genus <- as.character(bacteria_tax_table$Genus)
bacteria_tax_table$Species <- as.character(bacteria_tax_table$Species)

# Use a for loop to change all unwanted values to blank

unwanted_taxonomy <- c("uncultured","ambiguous","unidentified","metagenome")
columns <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
for (i in seq(1,length(unwanted_taxonomy))){
  bacteria_tax_table$Species[grep(unwanted_taxonomy[i], bacteria_tax_table$Species,ignore.case = TRUE)] <- ""
  bacteria_tax_table$Genus[grep(unwanted_taxonomy[i], bacteria_tax_table$Genus,ignore.case = TRUE)] <- ""
  bacteria_tax_table$Family[grep(unwanted_taxonomy[i], bacteria_tax_table$Family,ignore.case = TRUE)] <- ""
  bacteria_tax_table$Order[grep(unwanted_taxonomy[i], bacteria_tax_table$Order,ignore.case = TRUE)] <- ""
  bacteria_tax_table$Class[grep(unwanted_taxonomy[i], bacteria_tax_table$Class,ignore.case = TRUE)] <- ""
  bacteria_tax_table$Phylum[grep(unwanted_taxonomy[i], bacteria_tax_table$Phylum,ignore.case = TRUE)] <- ""
  bacteria_tax_table$Kingdom[grep(unwanted_taxonomy[i], bacteria_tax_table$Kingdom,ignore.case = TRUE)] <- ""
}

Load in phyloseq object - bacteria

bacteria_asv <- read.csv(here::here("data", "Worden_lab_data", "kosmos_bacteria_asv_table.csv"),row.names = 1)
# bacteria_asv %>% column_to_rownames(.,"asv") -> bacteria_asv
kosmos_16S_metadata <- read.csv(here::here("data", "Worden_lab_data", "KOSMOS_sample_data_16S.csv"),row.names = 1)
# bacteria_tax_table <- read.csv("/Users/mmin/Documents/KOSMOS_data_local/Worden_lab_data/kosmos_bacteria_tax_table.csv", row.names = 1)
bacteria_phyloseq <- merge_phyloseq(otu_table(bacteria_asv,taxa_are_rows = TRUE),tax_table(as.matrix(bacteria_tax_table)),sample_data(kosmos_16S_metadata))

# Subset out only M1 and Pacific samples
bacteria_16_phyloseq <- prune_samples(sample_data(bacteria_phyloseq)$SAMPLING_station_number %in% c("M1","MP"), bacteria_phyloseq)
# Remove all taxa that have zero reads
bacteria_16_phyloseq <- prune_taxa(taxa_sums(bacteria_16_phyloseq) > 0,bacteria_16_phyloseq)

Check on Synechococcus and Prochlorococcus read counts

bacteria_16_phyloseq_pro <- subset_taxa(bacteria_16_phyloseq, Genus == "Prochlorococcus_MIT9313")
bacteria_16_phyloseq_syn <- subset_taxa(bacteria_16_phyloseq, Genus %in% c("Synechococcus_PCC-7902","Synechococcus_CC9902"))

sample_sums(bacteria_16_phyloseq_pro)
##  KOSMOS.D1.M1BS  KOSMOS.D1.MPBS KOSMOS.D15.M1AS KOSMOS.D15.MPAS  KOSMOS.D24.M1A 
##              61             191              57             309              29 
## KOSMOS.D24.MPAS KOSMOS.D32.M1AS KOSMOS.D32.MPAS KOSMOS.D36.M1AS KOSMOS.D36.MPAS 
##              21              33              88              25              57 
## KOSMOS.D42.M1AS KOSMOS.D42.MPAS KOSMOS.D48.M1AS KOSMOS.D48.MPAS  KOSMOS.D8.M1AS 
##               0             286              11             145              25 
##  KOSMOS.D8.MPAS 
##            2577
sample_sums(bacteria_16_phyloseq_syn)
##  KOSMOS.D1.M1BS  KOSMOS.D1.MPBS KOSMOS.D15.M1AS KOSMOS.D15.MPAS  KOSMOS.D24.M1A 
##           23446           16508            1410            5798             760 
## KOSMOS.D24.MPAS KOSMOS.D32.M1AS KOSMOS.D32.MPAS KOSMOS.D36.M1AS KOSMOS.D36.MPAS 
##            2445             485            2013             112            3069 
## KOSMOS.D42.M1AS KOSMOS.D42.MPAS KOSMOS.D48.M1AS KOSMOS.D48.MPAS  KOSMOS.D8.M1AS 
##             121            1825             609            2120             983 
##  KOSMOS.D8.MPAS 
##           10254

Overview statistics (Table 1)

COI: Summary statistics

All samples combined

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_COI_16_phyloseq)
ntaxa <- ntaxa(kosmos_COI_16_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_COI_16_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_COI_16_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_COI <- data.frame(statistic_names,statistic_values)

overview_statistics_COI
##     statistic_names statistic_values
## 1 Number of Samples               16
## 2    Number of ASVs             2527
## 3   Number of Reads          1236508
## 4             Phyla               30
## 5           Classes               67
## 6            Orders              144
## 7          Families              228
## 8            Genera               72
## 9           Species               58
COI_phyla <- unique(tax_table_all.df$Phylum)
COI_genera <- unique(tax_table_all.df$Genus)

Just M1

kosmos_COI_M1_phyloseq <- prune_samples(sample_data(kosmos_COI_16_phyloseq)$SAMPLING_station_number == "M1", kosmos_COI_16_phyloseq)
kosmos_COI_M1_phyloseq <-  prune_taxa(taxa_sums(kosmos_COI_M1_phyloseq)>0,kosmos_COI_M1_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_COI_M1_phyloseq)
ntaxa <- ntaxa(kosmos_COI_M1_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_COI_M1_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_COI_M1_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_COI_M1 <- data.frame(statistic_names,statistic_values)

overview_statistics_COI_M1
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs             1784
## 3   Number of Reads           635388
## 4             Phyla               25
## 5           Classes               57
## 6            Orders              117
## 7          Families              166
## 8            Genera               48
## 9           Species               41
# Calculate mean + SD
kosmos_COI_M1_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(kosmos_COI_M1_phyloseq))))
kosmos_COI_M1_sample_sums <- rownames_to_column(kosmos_COI_M1_sample_sums,"sample")
colnames(kosmos_COI_M1_sample_sums)[2] <- "reads"

summarySE(kosmos_COI_M1_sample_sums,measurevar = "reads")
##    .id N   reads       sd       se       ci
## 1 <NA> 8 79423.5 22805.56 8062.984 19065.93

Just Pacific

kosmos_COI_MP_phyloseq <- prune_samples(sample_data(kosmos_COI_16_phyloseq)$SAMPLING_station_number == "MP", kosmos_COI_16_phyloseq)
kosmos_COI_MP_phyloseq <-  prune_taxa(taxa_sums(kosmos_COI_MP_phyloseq)>0,kosmos_COI_MP_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_COI_MP_phyloseq)
ntaxa <- ntaxa(kosmos_COI_MP_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_COI_MP_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_COI_MP_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_COI_MP <- data.frame(statistic_names,statistic_values)

overview_statistics_COI_MP
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs             1961
## 3   Number of Reads           601120
## 4             Phyla               28
## 5           Classes               61
## 6            Orders              128
## 7          Families              195
## 8            Genera               57
## 9           Species               49
# Calculate mean + SD
kosmos_COI_MP_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(kosmos_COI_MP_phyloseq))))
kosmos_COI_MP_sample_sums <- rownames_to_column(kosmos_COI_MP_sample_sums,"sample")
colnames(kosmos_COI_MP_sample_sums)[2] <- "reads"

summarySE(kosmos_COI_MP_sample_sums,measurevar = "reads")
##    .id N reads       sd       se       ci
## 1 <NA> 8 75140 6205.367 2193.929 5187.817

18S: Summary statistics

All samples combined

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_18S_16_phyloseq)
ntaxa <- ntaxa(kosmos_18S_16_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_18S_16_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_18S_16_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_18S <- data.frame(statistic_names,statistic_values)

overview_statistics_18S
##     statistic_names statistic_values
## 1 Number of Samples               16
## 2    Number of ASVs             3296
## 3   Number of Reads          1759671
## 4             Phyla               41
## 5           Classes               95
## 6            Orders              183
## 7          Families              254
## 8            Genera              269
## 9           Species              262
phyla_18S <- unique(tax_table_all.df$Phylum)
genera_18S <- unique(tax_table_all.df$Genus)

Just M1

kosmos_18S_M1_phyloseq <- prune_samples(sample_data(kosmos_18S_16_phyloseq)$SAMPLING_station_number == "M1", kosmos_18S_16_phyloseq)
kosmos_18S_M1_phyloseq <-  prune_taxa(taxa_sums(kosmos_18S_M1_phyloseq)>0,kosmos_18S_M1_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_18S_M1_phyloseq)
ntaxa <- ntaxa(kosmos_18S_M1_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_18S_M1_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_18S_M1_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_18S_M1 <- data.frame(statistic_names,statistic_values)

overview_statistics_18S_M1
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs             2105
## 3   Number of Reads          1006473
## 4             Phyla               31
## 5           Classes               76
## 6            Orders              139
## 7          Families              189
## 8            Genera              214
## 9           Species              196
# Calculate mean + SD
kosmos_18S_M1_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(kosmos_18S_M1_phyloseq))))
kosmos_18S_M1_sample_sums <- rownames_to_column(kosmos_18S_M1_sample_sums,"sample")
colnames(kosmos_18S_M1_sample_sums)[2] <- "reads"

summarySE(kosmos_18S_M1_sample_sums,measurevar = "reads")
##    .id N    reads       sd       se       ci
## 1 <NA> 8 125809.1 26365.08 9321.464 22041.76

Just Pacific

kosmos_18S_MP_phyloseq <- prune_samples(sample_data(kosmos_18S_16_phyloseq)$SAMPLING_station_number == "MP", kosmos_18S_16_phyloseq)
kosmos_18S_MP_phyloseq <-  prune_taxa(taxa_sums(kosmos_18S_MP_phyloseq)>0,kosmos_18S_MP_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_18S_MP_phyloseq)
ntaxa <- ntaxa(kosmos_18S_MP_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_18S_MP_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_18S_MP_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_18S_MP <- data.frame(statistic_names,statistic_values)

overview_statistics_18S_MP
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs             2825
## 3   Number of Reads           753198
## 4             Phyla               41
## 5           Classes               89
## 6            Orders              169
## 7          Families              226
## 8            Genera              241
## 9           Species              233
# Calculate mean + SD
kosmos_18S_MP_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(kosmos_18S_MP_phyloseq))))
kosmos_18S_MP_sample_sums <- rownames_to_column(kosmos_18S_MP_sample_sums,"sample")
colnames(kosmos_18S_MP_sample_sums)[2] <- "reads"

summarySE(kosmos_18S_MP_sample_sums,measurevar = "reads")
##    .id N    reads       sd       se       ci
## 1 <NA> 8 94149.75 19034.32 6729.649 15913.09

12S: Summary statistics

All samples combined

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_12S_16_phyloseq)
ntaxa <- ntaxa(kosmos_12S_16_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_12S_16_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_12S_16_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for genus, thus now -3
ngenera <- length(unique(tax_table_all.df$Genus)) - 3
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, thus now -3
nspecies <- length(unique(tax_table_all.df$Species)) - 3
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_12S <- data.frame(statistic_names,statistic_values)

overview_statistics_12S
##     statistic_names statistic_values
## 1 Number of Samples               16
## 2    Number of ASVs              445
## 3   Number of Reads           216841
## 4             Phyla                0
## 5           Classes                2
## 6            Orders               21
## 7          Families               30
## 8            Genera               31
## 9           Species               25
phyla_12S <- unique(tax_table_all.df$Phylum)
genera_12S <- unique(tax_table_all.df$Genus)

Just M1

kosmos_12S_M1_phyloseq <- prune_samples(sample_data(kosmos_12S_16_phyloseq)$SAMPLING_station_number == "M1", kosmos_12S_16_phyloseq)
kosmos_12S_M1_phyloseq <-  prune_taxa(taxa_sums(kosmos_12S_M1_phyloseq)>0,kosmos_12S_M1_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_12S_M1_phyloseq)
ntaxa <- ntaxa(kosmos_12S_M1_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_12S_M1_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_12S_M1_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unassigned", "unknown", and "no_hit",
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, but there is a value for "bacterium, thus -4
nspecies <- length(unique(tax_table_all.df$Species)) - 4
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_12S_M1 <- data.frame(statistic_names,statistic_values)

overview_statistics_12S_M1
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs              348
## 3   Number of Reads           105635
## 4             Phyla               -1
## 5           Classes                1
## 6            Orders               15
## 7          Families               21
## 8            Genera               23
## 9           Species               20
# Calculate mean + SD
kosmos_12S_M1_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(kosmos_12S_M1_phyloseq))))
kosmos_12S_M1_sample_sums <- rownames_to_column(kosmos_12S_M1_sample_sums,"sample")
colnames(kosmos_12S_M1_sample_sums)[2] <- "reads"

summarySE(kosmos_12S_M1_sample_sums,measurevar = "reads")
##    .id N    reads       sd       se       ci
## 1 <NA> 8 13204.38 9063.226 3204.334 7577.047

Just Pacific

kosmos_12S_MP_phyloseq <- prune_samples(sample_data(kosmos_12S_16_phyloseq)$SAMPLING_station_number == "MP", kosmos_12S_16_phyloseq)
kosmos_12S_MP_phyloseq <-  prune_taxa(taxa_sums(kosmos_12S_MP_phyloseq)>0,kosmos_12S_MP_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(kosmos_12S_MP_phyloseq)
ntaxa <- ntaxa(kosmos_12S_MP_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(kosmos_12S_MP_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(kosmos_12S_MP_phyloseq),"matrix"))
# "unknown", "unassigned", and "no_hit" are unique values for taxonomy, thus -3 for number of unique counts
nphyla <- length(unique(tax_table_all.df$Phylum)) - 3
nclasses <- length(unique(tax_table_all.df$Class)) - 3
norders <- length(unique(tax_table_all.df$Order)) - 3
nfamilies <- length(unique(tax_table_all.df$Family)) - 3 
# g_ is a unique value in addition to "unknown", "unassigned", and "no_hit", thus now -4
ngenera <- length(unique(tax_table_all.df$Genus)) - 4
# s_ is a unique value in addition to "unassigned", and "no_hit", but there is no "unknown value for species, but there is a value for "bacterium, thus -4
nspecies <- length(unique(tax_table_all.df$Species)) - 4
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_12S_MP <- data.frame(statistic_names,statistic_values)

overview_statistics_12S_MP
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs              246
## 3   Number of Reads           111206
## 4             Phyla                0
## 5           Classes                2
## 6            Orders               20
## 7          Families               26
## 8            Genera               24
## 9           Species               18
# Calculate mean + SD
kosmos_12S_MP_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(kosmos_12S_MP_phyloseq))))
kosmos_12S_MP_sample_sums <- rownames_to_column(kosmos_12S_MP_sample_sums,"sample")
colnames(kosmos_12S_MP_sample_sums)[2] <- "reads"

summarySE(kosmos_12S_MP_sample_sums,measurevar = "reads")
##    .id N    reads       sd       se       ci
## 1 <NA> 8 13900.75 6019.894 2128.354 5032.758

16S rRNA chloroplast sequences: Summary statistics

All samples combined

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(chloroplast_16_phyloseq)
ntaxa <- ntaxa(chloroplast_16_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(chloroplast_16_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(chloroplast_16_phyloseq),"matrix"))
# a blank value is considered unique, thus -1 for all levels of taxonomy
nphyla <- length(unique(tax_table_all.df$Phylum)) - 1
nclasses <- length(unique(tax_table_all.df$Class)) - 1
norders <- length(unique(tax_table_all.df$Order)) - 1
nfamilies <- length(unique(tax_table_all.df$Family)) - 1
ngenera <- length(unique(tax_table_all.df$Genus)) - 1
nspecies <- length(unique(tax_table_all.df$Species)) -1
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_chloroplast <- data.frame(statistic_names,statistic_values)

overview_statistics_chloroplast
##     statistic_names statistic_values
## 1 Number of Samples               16
## 2    Number of ASVs              258
## 3   Number of Reads           241783
## 4             Phyla                7
## 5           Classes               14
## 6            Orders               16
## 7          Families               15
## 8            Genera                4
## 9           Species                5
chloroplast_phyla <- unique(tax_table_all.df$Phylum)
chloroplast_genera <- unique(tax_table_all.df$Genus)

Just M1

chloroplast_M1_phyloseq <- prune_samples(sample_data(chloroplast_16_phyloseq)$SAMPLING_station_number == "M1", chloroplast_16_phyloseq)
chloroplast_M1_phyloseq <-  prune_taxa(taxa_sums(chloroplast_M1_phyloseq)>0,chloroplast_M1_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(chloroplast_M1_phyloseq)
ntaxa <- ntaxa(chloroplast_M1_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(chloroplast_M1_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(chloroplast_M1_phyloseq),"matrix"))
# a blank value is considered unique, thus -1 for all levels of taxonomy
nphyla <- length(unique(tax_table_all.df$Phylum)) - 1
nclasses <- length(unique(tax_table_all.df$Class)) - 1
norders <- length(unique(tax_table_all.df$Order)) - 1
nfamilies <- length(unique(tax_table_all.df$Family)) - 1
ngenera <- length(unique(tax_table_all.df$Genus)) - 1
nspecies <- length(unique(tax_table_all.df$Species)) -1
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_chloroplast_M1 <- data.frame(statistic_names,statistic_values)

overview_statistics_chloroplast_M1
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs              158
## 3   Number of Reads            73050
## 4             Phyla                5
## 5           Classes               13
## 6            Orders               15
## 7          Families               14
## 8            Genera                2
## 9           Species                2
# Calculate mean + SD
kosmos_chloroplast_M1_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(chloroplast_M1_phyloseq))))
kosmos_chloroplast_M1_sample_sums <- rownames_to_column(kosmos_chloroplast_M1_sample_sums,"sample")
colnames(kosmos_chloroplast_M1_sample_sums)[2] <- "reads"

summarySE(kosmos_chloroplast_M1_sample_sums,measurevar = "reads")
##    .id N   reads       sd       se       ci
## 1 <NA> 8 9131.25 6511.553 2302.182 5443.794

Just Pacific

chloroplast_MP_phyloseq <- prune_samples(sample_data(chloroplast_16_phyloseq)$SAMPLING_station_number == "MP", chloroplast_16_phyloseq)
chloroplast_MP_phyloseq <-  prune_taxa(taxa_sums(chloroplast_MP_phyloseq)>0,chloroplast_MP_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(chloroplast_MP_phyloseq)
ntaxa <- ntaxa(chloroplast_MP_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(chloroplast_MP_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(chloroplast_MP_phyloseq),"matrix"))
# a blank value is considered unique, thus -1 for all levels of taxonomy
nphyla <- length(unique(tax_table_all.df$Phylum)) - 1
nclasses <- length(unique(tax_table_all.df$Class)) - 1
norders <- length(unique(tax_table_all.df$Order)) - 1
nfamilies <- length(unique(tax_table_all.df$Family)) - 1
ngenera <- length(unique(tax_table_all.df$Genus)) - 1
nspecies <- length(unique(tax_table_all.df$Species)) -1
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_chloroplast_MP <- data.frame(statistic_names,statistic_values)

overview_statistics_chloroplast_MP
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs              203
## 3   Number of Reads           168733
## 4             Phyla                7
## 5           Classes               13
## 6            Orders               16
## 7          Families               15
## 8            Genera                4
## 9           Species                5
# Calculate mean + SD
kosmos_chloroplast_MP_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(chloroplast_MP_phyloseq))))
kosmos_chloroplast_MP_sample_sums <- rownames_to_column(kosmos_chloroplast_MP_sample_sums,"sample")
colnames(kosmos_chloroplast_MP_sample_sums)[2] <- "reads"

summarySE(kosmos_chloroplast_MP_sample_sums,measurevar = "reads")
##    .id N    reads       sd       se       ci
## 1 <NA> 8 21091.62 10737.76 3796.371 8976.992

16S bacterial sequences: Summary statistics

All samples

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(bacteria_16_phyloseq)
ntaxa <- ntaxa(bacteria_16_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(bacteria_16_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(bacteria_16_phyloseq),"matrix"))
# a blank value is considered unique, thus -1 for all levels of taxonomy
nphyla <- length(unique(tax_table_all.df$Phylum)) - 1
nclasses <- length(unique(tax_table_all.df$Class)) - 1
norders <- length(unique(tax_table_all.df$Order)) - 1
nfamilies <- length(unique(tax_table_all.df$Family)) - 1
ngenera <- length(unique(tax_table_all.df$Genus)) - 1
nspecies <- length(unique(tax_table_all.df$Species)) -1
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_bacteria <- data.frame(statistic_names,statistic_values)

overview_statistics_bacteria
##     statistic_names statistic_values
## 1 Number of Samples               16
## 2    Number of ASVs             5786
## 3   Number of Reads          3125836
## 4             Phyla               38
## 5           Classes               74
## 6            Orders              174
## 7          Families              256
## 8            Genera              478
## 9           Species               16
bacteria_phyla <- unique(tax_table_all.df$Phylum)
bacteria_genera <- unique(tax_table_all.df$Genus)

Just M1

bacteria_M1_phyloseq <- prune_samples(sample_data(bacteria_16_phyloseq)$SAMPLING_station_number == "M1", bacteria_16_phyloseq)
bacteria_M1_phyloseq <-  prune_taxa(taxa_sums(bacteria_M1_phyloseq)>0,bacteria_M1_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(bacteria_M1_phyloseq)
ntaxa <- ntaxa(bacteria_M1_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(bacteria_M1_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(bacteria_M1_phyloseq),"matrix"))
# a blank value is considered unique, thus -1 for all levels of taxonomy
nphyla <- length(unique(tax_table_all.df$Phylum)) - 1
nclasses <- length(unique(tax_table_all.df$Class)) - 1
norders <- length(unique(tax_table_all.df$Order)) - 1
nfamilies <- length(unique(tax_table_all.df$Family)) - 1
ngenera <- length(unique(tax_table_all.df$Genus)) - 1
nspecies <- length(unique(tax_table_all.df$Species)) -1
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_bacteria_M1 <- data.frame(statistic_names,statistic_values)

overview_statistics_bacteria_M1
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs             3223
## 3   Number of Reads          1624665
## 4             Phyla               24
## 5           Classes               43
## 6            Orders              130
## 7          Families              183
## 8            Genera              348
## 9           Species               10
# Calculate mean + SD
kosmos_bacteria_M1_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(bacteria_M1_phyloseq))))
kosmos_bacteria_M1_sample_sums <- rownames_to_column(kosmos_bacteria_M1_sample_sums,"sample")
colnames(kosmos_bacteria_M1_sample_sums)[2] <- "reads"

summarySE(kosmos_bacteria_M1_sample_sums,measurevar = "reads")
##    .id N    reads      sd       se       ci
## 1 <NA> 8 203083.1 20875.4 7380.568 17452.27

Just Pacific

bacteria_MP_phyloseq <- prune_samples(sample_data(bacteria_16_phyloseq)$SAMPLING_station_number == "MP", bacteria_16_phyloseq)
bacteria_MP_phyloseq <-  prune_taxa(taxa_sums(bacteria_MP_phyloseq)>0,bacteria_MP_phyloseq)

statistic_names <- c("Number of Samples", "Number of ASVs","Number of Reads","Phyla","Classes","Orders","Families","Genera","Species")
nsamples <- nsamples(bacteria_MP_phyloseq)
ntaxa <- ntaxa(bacteria_MP_phyloseq)
nreads <- sum(colSums(as.data.frame(otu_table(bacteria_MP_phyloseq))))
tax_table_all.df <- as.data.frame(as(tax_table(bacteria_MP_phyloseq),"matrix"))
# a blank value is considered unique, thus -1 for all levels of taxonomy
nphyla <- length(unique(tax_table_all.df$Phylum)) - 1
nclasses <- length(unique(tax_table_all.df$Class)) - 1
norders <- length(unique(tax_table_all.df$Order)) - 1
nfamilies <- length(unique(tax_table_all.df$Family)) - 1
ngenera <- length(unique(tax_table_all.df$Genus)) - 1
nspecies <- length(unique(tax_table_all.df$Species)) -1
statistic_values <- c(nsamples,ntaxa,nreads,nphyla,nclasses,norders,nfamilies,ngenera,nspecies)
overview_statistics_bacteria_MP <- data.frame(statistic_names,statistic_values)

overview_statistics_bacteria_MP
##     statistic_names statistic_values
## 1 Number of Samples                8
## 2    Number of ASVs             4139
## 3   Number of Reads          1501171
## 4             Phyla               38
## 5           Classes               72
## 6            Orders              166
## 7          Families              242
## 8            Genera              414
## 9           Species                8
# Calculate mean + SD
kosmos_bacteria_MP_sample_sums <- as.data.frame(colSums(as.data.frame(otu_table(bacteria_MP_phyloseq))))
kosmos_bacteria_MP_sample_sums <- rownames_to_column(kosmos_bacteria_MP_sample_sums,"sample")
colnames(kosmos_bacteria_MP_sample_sums)[2] <- "reads"

summarySE(kosmos_bacteria_MP_sample_sums,measurevar = "reads")
##    .id N    reads       sd       se       ci
## 1 <NA> 8 187646.4 36331.09 12844.98 30373.55

Join lists of phyla, genera, count number of unique phyla, genera

COI_phyla <- as.character(COI_phyla)
phyla_18S <- as.character(phyla_18S)
phyla_12S <- as.character(phyla_12S)
chloroplast_phyla <- as.character(chloroplast_phyla)
bacteria_phyla <- as.character(bacteria_phyla)

# don't count 12S for now, unless we decide to include them in the paper
all_phyla <- c(COI_phyla, phyla_12S, phyla_18S, bacteria_phyla, chloroplast_phyla)
length(unique(all_phyla)) -4
## [1] 86
# unknown, blank, no_hit, and unassigned are all in here, thus -4
sort(unique(all_phyla))
##  [1] ""                              "Acidobacteria"                
##  [3] "Actinobacteria"                "Annelida"                     
##  [5] "Apicomplexa"                   "Arthropoda"                   
##  [7] "Ascomycota"                    "Bacillariophyta"              
##  [9] "Bacteroidetes"                 "Basidiomycota"                
## [11] "Blastocladiomycota"            "Brachiopoda"                  
## [13] "Bryozoa"                       "Calditrichaeota"              
## [15] "Cercozoa"                      "Chaetognatha"                 
## [17] "Chloroflexi"                   "Chlorophyta"                  
## [19] "Chordata"                      "Chytridiomycota"              
## [21] "Ciliophora"                    "Cloacimonetes"                
## [23] "Cnidaria"                      "Cryptophyta"                  
## [25] "Ctenophora"                    "Cyanobacteria"                
## [27] "Dadabacteria"                  "Deferribacteres"              
## [29] "Deinococcus-Thermus"           "Dependentiae"                 
## [31] "Discoba"                       "Discosea"                     
## [33] "Echinodermata"                 "Epsilonbacteraeota"           
## [35] "Euglenida"                     "Euglenozoa"                   
## [37] "Fibrobacteres"                 "Firmicutes"                   
## [39] "Foraminifera"                  "Fusobacteria"                 
## [41] "Gastrotricha"                  "Gemmatimonadetes"             
## [43] "Haptista"                      "Haptophyta"                   
## [45] "Hemichordata"                  "Heterolobosea"                
## [47] "Hydrogenedentes"               "Imbricatea"                   
## [49] "Kiritimatiellaeota"            "Latescibacteria"              
## [51] "Lentisphaerae"                 "Margulisbacteria"             
## [53] "Marinimicrobia_(SAR406_clade)" "Microsporidia"                
## [55] "Mollusca"                      "Mucoromycota"                 
## [57] "Nematoda"                      "Nemertea"                     
## [59] "Nitrospinae"                   "no_hit"                       
## [61] "Ochrophyta"                    "Omnitrophicaeota"             
## [63] "Onychophora"                   "Patescibacteria"              
## [65] "Phoronida"                     "Picozoa"                      
## [67] "Placozoa"                      "Planctomycetes"               
## [69] "Platyhelminthes"               "Poribacteria"                 
## [71] "Porifera"                      "Proteobacteria"               
## [73] "Rappemonad"                    "Rhodophyta"                   
## [75] "Rokubacteria"                  "Rotifera"                     
## [77] "RsaHF231"                      "Spirochaetes"                 
## [79] "Streptophyta"                  "Synergistetes"                
## [81] "Tenericutes"                   "Thermotogae"                  
## [83] "Tubulinea"                     "unassigned"                   
## [85] "unknown"                       "Verrucomicrobia"              
## [87] "WOR-1"                         "WPS-2"                        
## [89] "Zixibacteria"                  "Zoopagomycota"
COI_genera <- as.character(COI_genera)
genera_18S <- as.character(genera_18S)
genera_12S <- as.character(genera_12S)
chloroplast_genera <- as.character(chloroplast_genera)
bacteria_genera <- as.character(bacteria_genera)

# don't count 12S for now, unless we decide to include them in the paper
all_genera <- c(COI_genera, genera_12S, genera_18S, bacteria_genera, chloroplast_genera)
# unknown, blank, no_hit, g_, and unassigned are all in here, thus -5

Field blank sample composition

# Control 1 = MilliQ, Control 2 = RO

kosmos_COI_field_blanks <- prune_samples(sample_names(kosmos_COI_phyloseq) %in% c("KOSMOSD3_MQCBa_X","KOSMOSD3_ROCBa_X"),kosmos_COI_phyloseq)
kosmos_COI_field_blanks <- prune_taxa(taxa_sums(kosmos_COI_field_blanks) > 0, kosmos_COI_field_blanks)
kosmos_18S_field_blanks <- prune_samples(sample_names(kosmos_18S_phyloseq) %in% c("KOSMOSD3_MQCBa_UU","KOSMOSD3_ROCBa_UU"),kosmos_18S_phyloseq)
kosmos_18S_field_blanks <- prune_taxa(taxa_sums(kosmos_18S_field_blanks) > 0, kosmos_18S_field_blanks)
kosmos_12S_field_blanks <- prune_samples(sample_names(kosmos_12S_phyloseq) %in% c("KOSMOSD3_MQCBa_TT","KOSMOSD3_ROCBa_TT"),kosmos_12S_phyloseq)
kosmos_12S_field_blanks <- prune_taxa(taxa_sums(kosmos_12S_field_blanks) > 0, kosmos_12S_field_blanks)
kosmos_bacteria_field_blanks <- prune_samples(sample_names(bacteria_phyloseq) %in% c("KOSMOS.D3.control1","KOSMOS.D3.control2"),bacteria_phyloseq)
kosmos_bacteria_field_blanks <- prune_taxa(taxa_sums(kosmos_bacteria_field_blanks) > 0, kosmos_bacteria_field_blanks)
kosmos_chloroplast_field_blanks <- prune_samples(sample_names(chloroplast_phyloseq) %in% c("KOSMOS.D3.control1","KOSMOS.D3.control2"),chloroplast_phyloseq)
kosmos_chloroplast_field_blanks <- prune_taxa(taxa_sums(kosmos_chloroplast_field_blanks) > 0, kosmos_chloroplast_field_blanks)

kosmos_COI_field_blanks_class <- tax_glom(kosmos_COI_field_blanks,taxrank=rank_names(kosmos_COI_field_blanks)[3], NArm = FALSE)
kosmos_18S_field_blanks_class <- tax_glom(kosmos_18S_field_blanks,taxrank=rank_names(kosmos_18S_field_blanks)[3], NArm = FALSE)
kosmos_12S_field_blanks_class <- tax_glom(kosmos_12S_field_blanks,taxrank=rank_names(kosmos_12S_field_blanks)[3], NArm = FALSE)
kosmos_bacteria_field_blanks_class <- tax_glom(kosmos_bacteria_field_blanks,taxrank=rank_names(kosmos_bacteria_field_blanks)[3], NArm = FALSE)
kosmos_chloroplast_field_blanks_class <- tax_glom(kosmos_chloroplast_field_blanks,taxrank=rank_names(kosmos_chloroplast_field_blanks)[3], NArm = FALSE)

# taxa_sums(kosmos_COI_field_blanks)
# taxa_sums(kosmos_18S_field_blanks)
# taxa_sums(kosmos_12S_field_blanks)
# taxa_sums(kosmos_bacteria_field_blanks_class)
# taxa_sums(kosmos_chloroplast_field_blanks_class)

# Create a table that documents ASVs found in field blanks and their abundance vs their abundance in the environmental samples

COI: Create tables that document ASVs found in field blanks and their abundance in field blanks vs. in the environmental samples

kosmos_COI_field_blanks_tax_table <- as.data.frame(as(tax_table(kosmos_COI_field_blanks),"matrix"))
kosmos_COI_field_blanks_tax_table <- rownames_to_column(kosmos_COI_field_blanks_tax_table,"ASV")
kosmos_COI_field_blanks_tax_table %>% mutate_all(as.character) -> kosmos_COI_field_blanks_tax_table

kosmos_COI_field_blanks_asv_table <- as.data.frame(otu_table(kosmos_COI_field_blanks))
kosmos_COI_field_blanks_asv_table <- rownames_to_column(kosmos_COI_field_blanks_asv_table, "ASV")
# Calculate the average proportion of each ASV in the field blanks
kosmos_COI_field_blanks_asv_table %>% mutate(., field_blank_mean_perc = (KOSMOSD3_MQCBa_X + KOSMOSD3_ROCBa_X)/(sum(kosmos_COI_field_blanks_asv_table$KOSMOSD3_MQCBa_X) + sum(kosmos_COI_field_blanks_asv_table$KOSMOSD3_ROCBa_X))) -> kosmos_COI_field_blanks_asv_table

# Calculate the average proportion of each ASV in the environmental samples
kosmos_COI_16_asv_table <- as.data.frame(otu_table(kosmos_COI_16_phyloseq))
kosmos_COI_16_asv_table <- rownames_to_column(kosmos_COI_16_asv_table, "ASV")
kosmos_COI_16_asv_table %>% mutate(., envt_mean_perc = (rowSums(.[,2:17]))/(sum(colSums(.[,2:17])))) -> kosmos_COI_16_asv_table

kosmos_COI_envt_means <- kosmos_COI_16_asv_table[,c("ASV","envt_mean_perc")]

kosmos_COI_field_blanks_v_envt <- left_join(kosmos_COI_field_blanks_asv_table,kosmos_COI_envt_means, by = "ASV")

kosmos_COI_field_blanks_v_envt <- kosmos_COI_field_blanks_v_envt[order(-kosmos_COI_field_blanks_v_envt$field_blank_mean_perc),]

kosmos_COI_field_blanks_v_envt <- left_join(kosmos_COI_field_blanks_v_envt, kosmos_COI_field_blanks_tax_table, by = "ASV")

write.csv(kosmos_COI_field_blanks_v_envt, here::here("figures", "field_blanks", "COI_field_blank_composition.csv"))

18S: Create tables that document ASVs found in field blanks and their abundance in field blanks vs. in the environmental samples

kosmos_18S_field_blanks_tax_table <- as.data.frame(as(tax_table(kosmos_18S_field_blanks),"matrix"))
kosmos_18S_field_blanks_tax_table <- rownames_to_column(kosmos_18S_field_blanks_tax_table,"ASV")
kosmos_18S_field_blanks_tax_table %>% mutate_all(as.character) -> kosmos_18S_field_blanks_tax_table

kosmos_18S_field_blanks_asv_table <- as.data.frame(otu_table(kosmos_18S_field_blanks))
kosmos_18S_field_blanks_asv_table <- rownames_to_column(kosmos_18S_field_blanks_asv_table, "ASV")
# Calculate the average proportion of each ASV in the field blanks
kosmos_18S_field_blanks_asv_table %>% mutate(., field_blank_mean_perc = (KOSMOSD3_MQCBa_UU + KOSMOSD3_ROCBa_UU)/(sum(kosmos_18S_field_blanks_asv_table$KOSMOSD3_MQCBa_UU) + sum(kosmos_18S_field_blanks_asv_table$KOSMOSD3_ROCBa_UU))) -> kosmos_18S_field_blanks_asv_table

# Calculate the average proportion of each ASV in the environmental samples
kosmos_18S_16_asv_table <- as.data.frame(otu_table(kosmos_18S_16_phyloseq))
kosmos_18S_16_asv_table <- rownames_to_column(kosmos_18S_16_asv_table, "ASV")
kosmos_18S_16_asv_table %>% mutate(., envt_mean_perc = (rowSums(.[,2:17]))/(sum(colSums(.[,2:17])))) -> kosmos_18S_16_asv_table

kosmos_18S_envt_means <- kosmos_18S_16_asv_table[,c("ASV","envt_mean_perc")]

kosmos_18S_field_blanks_v_envt <- left_join(kosmos_18S_field_blanks_asv_table,kosmos_18S_envt_means, by = "ASV")

kosmos_18S_field_blanks_v_envt <- kosmos_18S_field_blanks_v_envt[order(-kosmos_18S_field_blanks_v_envt$field_blank_mean_perc),]

kosmos_18S_field_blanks_v_envt <- left_join(kosmos_18S_field_blanks_v_envt, kosmos_18S_field_blanks_tax_table, by = "ASV")

write.csv(kosmos_18S_field_blanks_v_envt, here::here("figures", "field_blanks", "18S_field_blank_composition.csv"))

12S: Create tables that document ASVs found in field blanks and their abundance in field blanks vs. in the environmental samples

kosmos_12S_field_blanks_tax_table <- as.data.frame(as(tax_table(kosmos_12S_field_blanks),"matrix"))
kosmos_12S_field_blanks_tax_table <- rownames_to_column(kosmos_12S_field_blanks_tax_table,"ASV")
kosmos_12S_field_blanks_tax_table %>% mutate_all(as.character) -> kosmos_12S_field_blanks_tax_table

kosmos_12S_field_blanks_asv_table <- as.data.frame(otu_table(kosmos_12S_field_blanks))
kosmos_12S_field_blanks_asv_table <- rownames_to_column(kosmos_12S_field_blanks_asv_table, "ASV")
# Calculate the average proportion of each ASV in the field blanks
kosmos_12S_field_blanks_asv_table %>% mutate(., field_blank_mean_perc = (KOSMOSD3_MQCBa_TT + KOSMOSD3_ROCBa_TT)/(sum(kosmos_12S_field_blanks_asv_table$KOSMOSD3_MQCBa_TT) + sum(kosmos_12S_field_blanks_asv_table$KOSMOSD3_ROCBa_TT))) -> kosmos_12S_field_blanks_asv_table

# Calculate the average proportion of each ASV in the environmental samples
kosmos_12S_16_asv_table <- as.data.frame(otu_table(kosmos_12S_16_phyloseq))
kosmos_12S_16_asv_table <- rownames_to_column(kosmos_12S_16_asv_table, "ASV")
kosmos_12S_16_asv_table %>% mutate(., envt_mean_perc = (rowSums(.[,2:17]))/(sum(colSums(.[,2:17])))) -> kosmos_12S_16_asv_table

kosmos_12S_envt_means <- kosmos_12S_16_asv_table[,c("ASV","envt_mean_perc")]

kosmos_12S_field_blanks_v_envt <- left_join(kosmos_12S_field_blanks_asv_table,kosmos_12S_envt_means, by = "ASV")

kosmos_12S_field_blanks_v_envt <- kosmos_12S_field_blanks_v_envt[order(-kosmos_12S_field_blanks_v_envt$field_blank_mean_perc),]

kosmos_12S_field_blanks_v_envt <- left_join(kosmos_12S_field_blanks_v_envt, kosmos_12S_field_blanks_tax_table, by = "ASV")

write.csv(kosmos_12S_field_blanks_v_envt, here::here("figures", "field_blanks", "12S_field_blank_composition.csv"))

Bacteria: Create tables that document ASVs found in field blanks and their abundance in field blanks vs. in the environmental samples

kosmos_bacteria_field_blanks_tax_table <- as.data.frame(as(tax_table(kosmos_bacteria_field_blanks),"matrix"))
kosmos_bacteria_field_blanks_tax_table <- rownames_to_column(kosmos_bacteria_field_blanks_tax_table,"ASV")
kosmos_bacteria_field_blanks_tax_table %>% mutate_all(as.character) -> kosmos_bacteria_field_blanks_tax_table

kosmos_bacteria_field_blanks_asv_table <- as.data.frame(otu_table(kosmos_bacteria_field_blanks))
kosmos_bacteria_field_blanks_asv_table <- rownames_to_column(kosmos_bacteria_field_blanks_asv_table, "ASV")
# Calculate the average proportion of each ASV in the field blanks
kosmos_bacteria_field_blanks_asv_table %>% mutate(., field_blank_mean_perc = (KOSMOS.D3.control1 + KOSMOS.D3.control2)/(sum(kosmos_bacteria_field_blanks_asv_table$KOSMOS.D3.control1) + sum(kosmos_bacteria_field_blanks_asv_table$KOSMOS.D3.control2))) -> kosmos_bacteria_field_blanks_asv_table

# Calculate the average proportion of each ASV in the environmental samples
kosmos_bacteria_16_asv_table <- as.data.frame(otu_table(bacteria_16_phyloseq))
kosmos_bacteria_16_asv_table <- rownames_to_column(kosmos_bacteria_16_asv_table, "ASV")
kosmos_bacteria_16_asv_table %>% mutate(., envt_mean_perc = (rowSums(.[,2:17]))/(sum(colSums(.[,2:17])))) -> kosmos_bacteria_16_asv_table

kosmos_bacteria_envt_means <- kosmos_bacteria_16_asv_table[,c("ASV","envt_mean_perc")]

kosmos_bacteria_field_blanks_v_envt <- left_join(kosmos_bacteria_field_blanks_asv_table,kosmos_bacteria_envt_means, by = "ASV")

kosmos_bacteria_field_blanks_v_envt <- kosmos_bacteria_field_blanks_v_envt[order(-kosmos_bacteria_field_blanks_v_envt$field_blank_mean_perc),]

kosmos_bacteria_field_blanks_v_envt <- left_join(kosmos_bacteria_field_blanks_v_envt, kosmos_bacteria_field_blanks_tax_table, by = "ASV")

write.csv(kosmos_bacteria_field_blanks_v_envt, here::here("figures", "field_blanks", "bacteria_field_blank_composition.csv"))

Chloroplasts: Create tables that document ASVs found in field blanks and their abundance in field blanks vs. in the environmental samples

kosmos_chloroplast_field_blanks_tax_table <- as.data.frame(as(tax_table(kosmos_chloroplast_field_blanks),"matrix"))
kosmos_chloroplast_field_blanks_tax_table <- rownames_to_column(kosmos_chloroplast_field_blanks_tax_table,"ASV")
kosmos_chloroplast_field_blanks_tax_table %>% mutate_all(as.character) -> kosmos_chloroplast_field_blanks_tax_table

kosmos_chloroplast_field_blanks_asv_table <- as.data.frame(otu_table(kosmos_chloroplast_field_blanks))
kosmos_chloroplast_field_blanks_asv_table <- rownames_to_column(kosmos_chloroplast_field_blanks_asv_table, "ASV")
# Calculate the average proportion of each ASV in the field blanks
kosmos_chloroplast_field_blanks_asv_table %>% mutate(., field_blank_mean_perc = (KOSMOS.D3.control1 + KOSMOS.D3.control2)/(sum(kosmos_chloroplast_field_blanks_asv_table$KOSMOS.D3.control1) + sum(kosmos_chloroplast_field_blanks_asv_table$KOSMOS.D3.control2))) -> kosmos_chloroplast_field_blanks_asv_table

# Calculate the average proportion of each ASV in the environmental samples
kosmos_chloroplast_16_asv_table <- as.data.frame(otu_table(chloroplast_16_phyloseq))
kosmos_chloroplast_16_asv_table <- rownames_to_column(kosmos_chloroplast_16_asv_table, "ASV")
kosmos_chloroplast_16_asv_table %>% mutate(., envt_mean_perc = (rowSums(.[,2:17]))/(sum(colSums(.[,2:17])))) -> kosmos_chloroplast_16_asv_table

kosmos_chloroplast_envt_means <- kosmos_chloroplast_16_asv_table[,c("ASV","envt_mean_perc")]

kosmos_chloroplast_field_blanks_v_envt <- left_join(kosmos_chloroplast_field_blanks_asv_table,kosmos_chloroplast_envt_means, by = "ASV")

kosmos_chloroplast_field_blanks_v_envt <- kosmos_chloroplast_field_blanks_v_envt[order(-kosmos_chloroplast_field_blanks_v_envt$field_blank_mean_perc),]

kosmos_chloroplast_field_blanks_v_envt <- left_join(kosmos_chloroplast_field_blanks_v_envt, kosmos_chloroplast_field_blanks_tax_table, by = "ASV")

write.csv(kosmos_chloroplast_field_blanks_v_envt, here::here("figures", "field_blanks", "chloroplast_field_blank_composition.csv"))

PCA (Figure 3)

Note When running the DEICODE PCA, we are manually flipping the sign of our ordination results so that they all align nicely for the plot.

Make one legend for PCA plots

legend_df <- data.frame(c("Pacific","Mesocosm 1 (M1)"),c(1,1),c(0.78,0.975))
colnames(legend_df) <- c("station","position_x","position_y")
legend_gg <- ggplot(data = legend_df, aes(x = position_x, y = position_y, color = station, shape = station))+
  geom_point(size = 14)+
  geom_text(data = subset(legend_df,station == "Pacific"), aes(x = position_x+0.2, y = position_y+0.0175, color = station,label = station),hjust = 0, size = 18)+
    geom_text(data = subset(legend_df,station == "Mesocosm 1 (M1)"), aes(x = position_x+0.2, y = position_y+0.01, color = station,label = station),hjust = 0, size = 18)+
  scale_color_manual(name = "station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
    theme(panel.background = element_rect(fill="white"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(), 
        legend.position = "none",
        plot.margin = margin(0, 0, 0, 0, "cm"),)+
  ylim(0.7,1.1)+
  xlim(0.9,3.5)
    
  # annotate("text",label = "Mesocosm 1 (M1)", x = 31.5, y = 20.55,size = 6,adj = 0)+ #mesocosm
  # annotate("text",label = "Pacific", x = 31.5, y = 20.35, size = 6, adj = 0)+ #pacificß

legend_gg

ggsave(here::here("figures", "PCA_figure", "PCA_legend_v2.png"),legend_gg, width = 7, height = 2)

COI

COI: Export ASV table,tax_table, and metadata for deicode - only M1 and Pacific samples

kosmos_COI_16_asv_table <- as.data.frame(otu_table(kosmos_COI_16_phyloseq))
kosmos_COI_16_asv_table <- rownames_to_column(kosmos_COI_16_asv_table,"#OTUID")
write.table(kosmos_COI_16_asv_table,here::here("local_deicode_runs", "COI", "kosmos_COI_16_asv_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_COI_16_tax_table <- as.data.frame(as(tax_table(kosmos_COI_16_phyloseq),"matrix"))
kosmos_COI_16_tax_table <- rownames_to_column(kosmos_COI_16_tax_table,"#OTUID")
write.table(kosmos_COI_16_tax_table,here::here("local_deicode_runs", "COI", "kosmos_COI_16_tax_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_COI_16samples_metadata <- as.data.frame(as.matrix(sample_data(kosmos_COI_16_phyloseq)))
kosmos_COI_16samples_metadata <- rownames_to_column(kosmos_COI_16samples_metadata,"#SampleID")
kosmos_COI_16samples_metadata <- kosmos_COI_16samples_metadata[,!(names(kosmos_COI_16samples_metadata) %in% c("sample_name"))]
# Add a sample name column
sample <- c("M15","P15","M1","P1","M24","P24","M32","P32","M36","P36","M42","P42","M48","P48","M8","P8")
kosmos_COI_16samples_metadata["sample"] <- sample
# Add a day of experiment column
day_of_experiment <- c(15,15,1,1,24,24,32,32,36,36,42,42,48,48,8,8)
kosmos_COI_16samples_metadata["day_of_experiment"] <- day_of_experiment
# Add a column for group (M1 after OMZ water addition, or Pacific + first two M1 samples)
kosmos_COI_16samples_metadata %>% mutate(.,group = ifelse(sample %in% c("M15","M24","M32","M36","M42","M48"),"M1_postOMZ","P_all_M1_preOMZ")) -> kosmos_COI_16samples_metadata
# kosmos_COI_16samples_metadata %>% mutate(.,SAMPLING_day = regmatches(kosmos_COI_16samples_metadata$sample, gregexpr("[[:digit:]]+", kosmos_COI_16samples_metadata$sample))) -> kosmos_COI_16samples_metadata
# metadata_12S <- metadata_12S %>%
#   dplyr::select("X.SampleID", everything())
# metadata_12S %>% dplyr::rename("#SampleID" = X.SampleID) -> metadata_12S
write.table(kosmos_COI_16samples_metadata,here::here("local_deicode_runs", "COI", "kosmos_COI_16_sample_data_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

COI: DEICODE RPCA, 3 PCs

pco <- read_qza(here::here("local_deicode_runs", "COI", "ordination.qza"))

label.PC1 <- paste0("PC1 (", round(pco$data$ProportionExplained$PC1, 3)*100,"%)")
label.PC1
## [1] "PC1 (56.2%)"
label.PC2 <- paste0("PC2 (", round(pco$data$ProportionExplained$PC2, 3)*100,"%)")
label.PC2
## [1] "PC2 (42.9%)"
label.PC3 <- paste0("PC3 (", round(pco$data$ProportionExplained$PC3, 3)*100,"%)")
label.PC3
## [1] "PC3 (0.9%)"
## Prepare PCA data for ggplot

pca_metadata <- kosmos_COI_16samples_metadata
pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata
#pca_metadata$depth_cat <- factor(pca_metadata$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))
pca_data <- pco$data$Vectors

# Manually change sign of PC1
pca_data$PC1 <- pca_data$PC1*-1

#pca_data <- left_join(pca_data,pca_metadata,on = "SampleID")
pca_data <- right_join(pca_data,pca_metadata,on = "SampleID")
## Joining, by = "SampleID"
pca_data <- subset(pca_data, !(is.na(pca_data$PC1)))
## Plot PCA in ggplot

pca_gg_sample <- ggplot(pca_data,aes(x=PC1,y=PC2,color = SAMPLING_station_number,shape = SAMPLING_station_number, label = day_of_experiment))+
  geom_point(size = 6)+
  # Subset some samples to be above right
  geom_text(data = subset(pca_data,sample %in% c("P32","P15","P8","P1","M24","M32")),aes(x=PC1+0.007,y=PC2+0.05,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
    # Subset some samples to be below left
  geom_text(data = subset(pca_data,sample %in% c("P42","M1","M48","M42","M36")),aes(x=PC1-0.007,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
      # Subset some samples to be below right
  geom_text(data = subset(pca_data,sample %in% c("M8","P48")),aes(x=PC1+0.009,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
      # Subset some samples to be above left
  geom_text(data = subset(pca_data,sample %in% c("P24","M15","P36")),aes(x=PC1-0.007,y=PC2+0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # geom_text(hjust = 1, vjust=-0.5, segment.alpha = 0, show_guide = FALSE)+
  xlab(print(label.PC1))+
  ylab(print(label.PC2))+
  scale_color_manual(name = "Station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "Station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
  annotate("text",  x=Inf, y = Inf, label = "(a)", vjust=1.5, hjust=1.3,size = 12)+
  xlim(NA,max(pca_data$PC1)+0.03)+
  theme(panel.background = element_rect(fill = "white",color = "black",size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.title = element_text(size = 20,face = "bold"),
        axis.text = element_blank(),
        axis.ticks.length=unit(0, "cm"),
        plot.margin = margin(0.25, 0, 0.25, 0.25, "cm"),
        legend.position = "none",
        plot.title = element_text(size = 50, face = "bold", hjust = 0.5))+
  ggtitle("COI")
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## [1] "PC1 (56.2%)"
## [1] "PC2 (42.9%)"
pca_gg_sample

ggsave(here::here("figures", "PCA_figure", "COI_pca_kosmos_3PCs_16samples_9.25.20.png"),pca_gg_sample,width = 7,height = 6)

##### PERMANOVA #####
# Extract metadata
pca_metadata <- pca_data
# pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata

# Create a new column to compare M1 post OMZ water addition with other samples
pca_metadata %>% 
  mutate(permanova_group = ifelse(SAMPLING_station == "Pacific" | SAMPLING_station == "Mesocosm 1" & sample == "M1" | SAMPLING_station == "Mesocosm 1" & sample == "M8",
                                  "Pacific & M1 pre-OMZ", "M1 post-OMZ")) -> pca_metadata

## Load COI Data
distance <- read_qza(here::here("local_deicode_runs", "COI", "distance.qza"))

# Extract distance matrix
distance_matrix <- distance$data

# convert DEICODE matrix to "dist" class object
PCA_dist <- as.dist(distance_matrix)
##----Run individual PERMANOVAs-------------------------------------------------------------

# PERMANOVA for CTD vs. ESP
set.seed(123)
M1_v_Pacific_permanova_COI <- adonis2(PCA_dist ~ permanova_group, data = pca_metadata, permutations=999)
M1_v_Pacific_permanova_COI
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = PCA_dist ~ permanova_group, data = pca_metadata, permutations = 999)
##                 Df SumOfSqs      R2      F Pr(>F)    
## permanova_group  1   14.769 0.45689 11.778  0.001 ***
## Residual        14   17.556 0.54311                  
## Total           15   32.325 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

18S

18S: Export ASV table,tax_table, and metadata for deicode - only M1 and Pacific samples

kosmos_18S_16_asv_table <- as.data.frame(otu_table(kosmos_18S_16_phyloseq))
kosmos_18S_16_asv_table <- rownames_to_column(kosmos_18S_16_asv_table,"#OTUID")
write.table(kosmos_18S_16_asv_table,here::here("local_deicode_runs", "18S", "kosmos_18S_16_asv_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_18S_16_tax_table <- as.data.frame(as(tax_table(kosmos_18S_16_phyloseq),"matrix"))
kosmos_18S_16_tax_table <- rownames_to_column(kosmos_18S_16_tax_table,"#OTUID")
write.table(kosmos_18S_16_tax_table,here::here("local_deicode_runs", "18S", "kosmos_18S_16_tax_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_18S_16samples_metadata <- as.data.frame(as.matrix(sample_data(kosmos_18S_16_phyloseq)))
kosmos_18S_16samples_metadata <- rownames_to_column(kosmos_18S_16samples_metadata,"#SampleID")
kosmos_18S_16samples_metadata <- kosmos_18S_16samples_metadata[,!(names(kosmos_18S_16samples_metadata) %in% c("sample_name"))]
# Add a sample name column
sample <- c("M15","P15","M1","P1","M24","P24","M32","P32","M36","P36","M42","P42","M48","P48","M8","P8")
kosmos_18S_16samples_metadata["sample"] <- sample
# Add a day of experiment column
day_of_experiment <- c(15,15,1,1,24,24,32,32,36,36,42,42,48,48,8,8)
kosmos_18S_16samples_metadata["day_of_experiment"] <- day_of_experiment
# Add a column for group (M1 after OMZ water addition, or Pacific + first two M1 samples)
kosmos_18S_16samples_metadata %>% mutate(.,group = ifelse(sample %in% c("M15","M24","M32","M36","M42","M48"),"M1_postOMZ","P_all_M1_preOMZ")) -> kosmos_18S_16samples_metadata
# kosmos_18S_16samples_metadata %>% mutate(.,SAMPLING_day = regmatches(kosmos_18S_16samples_metadata$sample, gregexpr("[[:digit:]]+", kosmos_18S_16samples_metadata$sample))) -> kosmos_18S_16samples_metadata
# metadata_12S <- metadata_12S %>%
#   dplyr::select("X.SampleID", everything())
# metadata_12S %>% dplyr::rename("#SampleID" = X.SampleID) -> metadata_12S
write.table(kosmos_18S_16samples_metadata,here::here("local_deicode_runs", "18S", "kosmos_18S_16_sample_data_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

18S: DEICODE RPCA, 3 PCs

pco <- read_qza(here::here("local_deicode_runs", "18S", "ordination.qza"))

label.PC1 <- paste0("PC1 (", round(pco$data$ProportionExplained$PC1, 3)*100,"%)")
label.PC1
## [1] "PC1 (60.8%)"
label.PC2 <- paste0("PC2 (", round(pco$data$ProportionExplained$PC2, 3)*100,"%)")
label.PC2
## [1] "PC2 (37.3%)"
label.PC3 <- paste0("PC3 (", round(pco$data$ProportionExplained$PC3, 3)*100,"%)")
label.PC3
## [1] "PC3 (2%)"
## Prepare PCA data for ggplot

pca_metadata <- kosmos_18S_16samples_metadata
pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata
#pca_metadata$depth_cat <- factor(pca_metadata$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))
pca_data <- pco$data$Vectors

# Manually change signs of PC1 and PC2
pca_data$PC1 <- pca_data$PC1*-1
pca_data$PC2 <- pca_data$PC2*-1

#pca_data <- left_join(pca_data,pca_metadata,on = "SampleID")
pca_data <- right_join(pca_data,pca_metadata,on = "SampleID")
## Joining, by = "SampleID"
pca_data <- subset(pca_data, !(is.na(pca_data$PC1)))
## Plot PCA in ggplot

pca_gg_sample <- ggplot(pca_data,aes(x=PC1,y=PC2,color = SAMPLING_station_number,shape = SAMPLING_station_number, label = day_of_experiment))+
  geom_point(size = 6)+
  # Subset some samples to be above right
  geom_text(data = subset(pca_data,sample %in% c("M48","P24", "M8", "P32","P8")),aes(x=PC1+0.007,y=PC2+0.05,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
  # Subset some samples to be below left
  geom_text(data = subset(pca_data,sample %in% c()),aes(x=PC1-0.007,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # Subset some samples to be below right
  geom_text(data = subset(pca_data,sample %in% c("M1","P1","P15","P48","P36","M36","M32")),aes(x=PC1+0.009,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
  # Subset some samples to be above left
  geom_text(data = subset(pca_data,sample %in% c("M42","P42","M15","M24")),aes(x=PC1-0.007,y=PC2+0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # geom_text(hjust = 1, vjust=-0.5, segment.alpha = 0, show_guide = FALSE)+
  xlab(print(label.PC1))+
  ylab(print(label.PC2))+
  scale_color_manual(name = "Station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "Station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
  annotate("text",  x=Inf, y = Inf, label = "(b)", vjust=1.5, hjust=1.3,size = 12)+
  theme(panel.background = element_rect(fill = "white",color = "black",size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.title = element_text(size = 20,face = "bold"),
        axis.text = element_blank(),
        axis.ticks.length=unit(0, "cm"),
        plot.margin = margin(0.25, 0, 0.25, 0.25, "cm"),
        legend.position = "none",
        plot.title = element_text(size = 50, face = "bold", hjust = 0.5))+
  ggtitle("18S rRNA")+
  xlim(NA,max(pca_data$PC1)+0.03)
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## [1] "PC1 (60.8%)"
## [1] "PC2 (37.3%)"
pca_gg_sample

ggsave(here::here("figures", "PCA_figure", "18S_pca_kosmos_3PCs_16samples_9.25.20.png"),pca_gg_sample,width = 7,height = 6)

##### PERMANOVA #####
# Extract metadata
pca_metadata <- pca_data
# pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata

# Create a new column to compare M1 post OMZ water addition with other samples
pca_metadata %>% 
  mutate(permanova_group = ifelse(SAMPLING_station == "Pacific" | SAMPLING_station == "Mesocosm 1" & sample == "M1" | SAMPLING_station == "Mesocosm 1" & sample == "M8",
                                  "Pacific & M1 pre-OMZ", "M1 post-OMZ")) -> pca_metadata

## Load COI Data
distance <- read_qza(here::here("local_deicode_runs", "18S", "distance.qza"))

# Extract distance matrix
distance_matrix <- distance$data

# convert DEICODE matrix to "dist" class object
PCA_dist <- as.dist(distance_matrix)
##----Run individual PERMANOVAs-------------------------------------------------------------

# PERMANOVA for CTD vs. ESP
set.seed(123)
M1_v_Pacific_permanova_18S <- adonis2(PCA_dist ~ permanova_group, data = pca_metadata, permutations=999)
M1_v_Pacific_permanova_18S
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = PCA_dist ~ permanova_group, data = pca_metadata, permutations = 999)
##                 Df SumOfSqs      R2      F Pr(>F)    
## permanova_group  1   14.766 0.45661 11.764  0.001 ***
## Residual        14   17.573 0.54339                  
## Total           15   32.339 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

12S

12S: Export ASV table,tax_table, and metadata for deicode - only M1 and Pacific samples

kosmos_12S_16_asv_table <- as.data.frame(otu_table(kosmos_12S_16_phyloseq))
kosmos_12S_16_asv_table <- rownames_to_column(kosmos_12S_16_asv_table,"#OTUID")
write.table(kosmos_12S_16_asv_table,here::here("local_deicode_runs", "12S", "kosmos_12S_16_asv_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_12S_16_tax_table <- as.data.frame(as(tax_table(kosmos_12S_16_phyloseq),"matrix"))
kosmos_12S_16_tax_table <- rownames_to_column(kosmos_12S_16_tax_table,"#OTUID")
write.table(kosmos_12S_16_tax_table,here::here("local_deicode_runs", "12S", "kosmos_12S_16_tax_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_12S_16samples_metadata <- as.data.frame(as.matrix(sample_data(kosmos_12S_16_phyloseq)))
kosmos_12S_16samples_metadata <- rownames_to_column(kosmos_12S_16samples_metadata,"#SampleID")
kosmos_12S_16samples_metadata <- kosmos_12S_16samples_metadata[,!(names(kosmos_12S_16samples_metadata) %in% c("sample_name"))]
# Add a sample name column
sample <- c("M15","P15","M1","P1","M24","P24","M32","P32","M36","P36","M42","P42","M48","P48","M8","P8")
kosmos_12S_16samples_metadata["sample"] <- sample
# Add a day of experiment column
day_of_experiment <- c(15,15,1,1,24,24,32,32,36,36,42,42,48,48,8,8)
kosmos_12S_16samples_metadata["day_of_experiment"] <- day_of_experiment
# Add a column for group (M1 after OMZ water addition, or Pacific + first two M1 samples)
kosmos_12S_16samples_metadata %>% mutate(.,group = ifelse(sample %in% c("M15","M24","M32","M36","M42","M48"),"M1_postOMZ","P_all_M1_preOMZ")) -> kosmos_12S_16samples_metadata
# kosmos_12S_16samples_metadata %>% mutate(.,SAMPLING_day = regmatches(kosmos_12S_16samples_metadata$sample, gregexpr("[[:digit:]]+", kosmos_12S_16samples_metadata$sample))) -> kosmos_12S_16samples_metadata
# metadata_12S <- metadata_12S %>%
#   dplyr::select("X.SampleID", everything())
# metadata_12S %>% dplyr::rename("#SampleID" = X.SampleID) -> metadata_12S
write.table(kosmos_12S_16samples_metadata,here::here("local_deicode_runs", "12S", "kosmos_12S_16_sample_data_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

12S: DEICODE RPCA, 3 PCs

pco <- read_qza(here::here("local_deicode_runs", "12S", "ordination.qza"))

label.PC1 <- paste0("PC1 (", round(pco$data$ProportionExplained$PC1, 3)*100,"%)")
label.PC1
## [1] "PC1 (59.2%)"
label.PC2 <- paste0("PC2 (", round(pco$data$ProportionExplained$PC2, 3)*100,"%)")
label.PC2
## [1] "PC2 (31.6%)"
label.PC3 <- paste0("PC3 (", round(pco$data$ProportionExplained$PC3, 3)*100,"%)")
label.PC3
## [1] "PC3 (9.3%)"
## Prepare PCA data for ggplot

pca_metadata <- kosmos_12S_16samples_metadata
pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata
#pca_metadata$depth_cat <- factor(pca_metadata$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))
pca_data <- pco$data$Vectors
#pca_data <- left_join(pca_data,pca_metadata,on = "SampleID")
pca_data <- right_join(pca_data,pca_metadata,on = "SampleID")
## Joining, by = "SampleID"
pca_data <- subset(pca_data, !(is.na(pca_data$PC1)))
## Plot PCA in ggplot

pca_gg_sample <- ggplot(pca_data,aes(x=PC1,y=PC2,color = SAMPLING_station_number,shape = SAMPLING_station_number, label = day_of_experiment))+
  geom_point(size = 6)+
  # Subset some samples to be above right
  geom_text(data = subset(pca_data,sample %in% c("M48","P24", "M8", "P32","P8","P42")),aes(x=PC1+0.007,y=PC2+0.05,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
  # Subset some samples to be below left
  geom_text(data = subset(pca_data,sample %in% c("P48")),aes(x=PC1-0.007,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # Subset some samples to be below right
  geom_text(data = subset(pca_data,sample %in% c("M1","P15","P36","M36","M32","M15")),aes(x=PC1+0.009,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
  # Subset some samples to be above left
  geom_text(data = subset(pca_data,sample %in% c("M42","M24","P1")),aes(x=PC1-0.007,y=PC2+0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # geom_text(hjust = 1, vjust=-0.5, segment.alpha = 0, show_guide = FALSE)+
  xlab(print(label.PC1))+
  ylab(print(label.PC2))+
  scale_color_manual(name = "Station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "Station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
  annotate("text",  x=Inf, y = Inf, label = "(b)", vjust=1.5, hjust=1.3,size = 12)+
  theme(panel.background = element_rect(fill = "white",color = "black",size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.title = element_text(size = 20,face = "bold"),
        axis.text = element_blank(),
        axis.ticks.length=unit(0, "cm"),
        plot.margin = margin(0.25, 0, 0.25, 0.25, "cm"),
        plot.title = element_text(size = 50, face = "bold", hjust = 0.5),
        legend.position = "none")+
  ggtitle("12S")+
  xlim(NA,max(pca_data$PC1)+0.03)
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## [1] "PC1 (59.2%)"
## [1] "PC2 (31.6%)"
pca_gg_sample

ggsave(here::here("figures", "PCA_figure", "12S_pca_kosmos_3PCs_16samples_9.25.20.png"),pca_gg_sample,width = 7,height = 5)

12S, chordates only

kosmos_12S_chordata_16_phyloseq <- subset_taxa(kosmos_12S_16_phyloseq, Phylum == "Chordata")

kosmos_12S_chordata_16_asv_table <- as.data.frame(otu_table(kosmos_12S_chordata_16_phyloseq))
kosmos_12S_chordata_16_asv_table <- rownames_to_column(kosmos_12S_chordata_16_asv_table,"#OTUID")
write.table(kosmos_12S_chordata_16_asv_table,here::here("local_deicode_runs", "12S", "chordata", "kosmos_12S_chordata_16_asv_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_12S_chordata_16_tax_table <- as.data.frame(as(tax_table(kosmos_12S_chordata_16_phyloseq),"matrix"))
kosmos_12S_chordata_16_tax_table <- rownames_to_column(kosmos_12S_chordata_16_tax_table,"#OTUID")
write.table(kosmos_12S_chordata_16_tax_table,here::here("local_deicode_runs", "12S", "chordata", "kosmos_12S_chordata_16_tax_table_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_12S_chordata_16samples_metadata <- as.data.frame(as.matrix(sample_data(kosmos_12S_chordata_16_phyloseq)))
kosmos_12S_chordata_16samples_metadata <- rownames_to_column(kosmos_12S_chordata_16samples_metadata,"#SampleID")
kosmos_12S_chordata_16samples_metadata <- kosmos_12S_chordata_16samples_metadata[,!(names(kosmos_12S_chordata_16samples_metadata) %in% c("sample_name"))]
# Add a sample name column
sample <- c("M1","P1","M15","P15","M24","P24","M32","P32","M36","P36","M42","P42","M48","P48","M8","P8")
kosmos_12S_chordata_16samples_metadata["sample"] <- sample
# Add a day of experiment column
day_of_experiment <- c(1,1,15,15,24,24,32,32,36,36,42,42,48,48,8,8)
kosmos_12S_chordata_16samples_metadata["day_of_experiment"] <- day_of_experiment
# Add a column for group (M1 after OMZ water addition, or Pacific + first two M1 samples)
kosmos_12S_chordata_16samples_metadata %>% mutate(.,group = ifelse(sample %in% c("M15","M24","M32","M36","M42","M48"),"M1_postOMZ","P_all_M1_preOMZ")) -> kosmos_12S_chordata_16samples_metadata
# kosmos_12S_chordata_16samples_metadata %>% mutate(.,SAMPLING_day = regmatches(kosmos_12S_chordata_16samples_metadata$sample, gregexpr("[[:digit:]]+", kosmos_12S_chordata_16samples_metadata$sample))) -> kosmos_12S_chordata_16samples_metadata
# metadata_12S <- metadata_12S %>%
#   dplyr::select("X.SampleID", everything())
# metadata_12S %>% dplyr::rename("#SampleID" = X.SampleID) -> metadata_12S
write.table(kosmos_12S_chordata_16samples_metadata,here::here("local_deicode_runs", "12S", "chordata", "kosmos_12S_chordata_16_sample_data_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

12S, chordates only: DEICODE RPCA, 3 PCs

pco <- read_qza(here::here("local_deicode_runs", "12S", "chordata", "ordination.qza"))

label.PC1 <- paste0("PC1 (", round(pco$data$ProportionExplained$PC1, 3)*100,"%)")
label.PC1
## [1] "PC1 (57.5%)"
label.PC2 <- paste0("PC2 (", round(pco$data$ProportionExplained$PC2, 3)*100,"%)")
label.PC2
## [1] "PC2 (39.4%)"
label.PC3 <- paste0("PC3 (", round(pco$data$ProportionExplained$PC3, 3)*100,"%)")
label.PC3
## [1] "PC3 (3.2%)"
## Prepare PCA data for ggplot

pca_metadata <- kosmos_12S_16samples_metadata
pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata
#pca_metadata$depth_cat <- factor(pca_metadata$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))
pca_data <- pco$data$Vectors
#pca_data <- left_join(pca_data,pca_metadata,on = "SampleID")
pca_data <- right_join(pca_data,pca_metadata,on = "SampleID")
## Joining, by = "SampleID"
pca_data <- subset(pca_data, !(is.na(pca_data$PC1)))
## Plot PCA in ggplot

pca_gg_sample <- ggplot(pca_data,aes(x=PC1,y=PC2,color = SAMPLING_station_number,shape = SAMPLING_station_number, label = day_of_experiment))+
  geom_point(size = 6)+
  # Subset some samples to be above right
  geom_text(data = subset(pca_data,sample %in% c("M48","P24", "M8", "P8","P42")),aes(x=PC1+0.007,y=PC2+0.05,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
  # Subset some samples to be below left
  geom_text(data = subset(pca_data,sample %in% c("P1","P48")),aes(x=PC1-0.007,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # Subset some samples to be below right
  geom_text(data = subset(pca_data,sample %in% c("M1","P36","M36","M32","M15")),aes(x=PC1+0.009,y=PC2-0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
  # Subset some samples to be above left
  geom_text(data = subset(pca_data,sample %in% c("M42","M24","P32","P15")),aes(x=PC1-0.007,y=PC2+0.04,color = SAMPLING_station_number,label = day_of_experiment),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # geom_text(hjust = 1, vjust=-0.5, segment.alpha = 0, show_guide = FALSE)+
  xlab(print(label.PC1))+
  ylab(print(label.PC2))+
  scale_color_manual(name = "Station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "Station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
  # annotate("text",  x=Inf, y = Inf, label = "(b)", vjust=1.5, hjust=1.3,size = 12)+
  theme(panel.background = element_rect(fill = "white",color = "black",size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.title = element_text(size = 20,face = "bold"),
        axis.text = element_blank(),
        axis.ticks.length=unit(0, "cm"),
        plot.margin = margin(0.25, 0, 0.25, 0.25, "cm"),
        legend.position = "none",
        plot.title = element_text(size = 50, face = "bold", hjust = 0.5))+
  xlim(min(pca_data$PC1)-0.03,max(pca_data$PC1)+0.03)+
  ggtitle("12S")
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## [1] "PC1 (57.5%)"
## [1] "PC2 (39.4%)"
pca_gg_sample

ggsave(here::here("figures", "PCA_figure", "12S_pca_kosmos_chordata_3PCs_16samples_9.25.20.png"),pca_gg_sample,width = 7,height = 6)

16S rRNA: chloroplast sequences

16S chloroplast: Export ASV table,tax_table, and metadata for deicode - only M1 and Pacific samples

chloroplast_16_asv_table <- as.data.frame(otu_table(chloroplast_16_phyloseq))
chloroplast_16_asv_table <- rownames_to_column(chloroplast_16_asv_table,"#OTUID")
write.table(chloroplast_16_asv_table,here::here("local_deicode_runs", "16S_plastidial", "kosmos_chloroplast_asv_table_16samples_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
chloroplast_16_tax_table<- as.data.frame(as(tax_table(chloroplast_16_phyloseq),"matrix"))
chloroplast_16_tax_table <- rownames_to_column(chloroplast_16_tax_table,"#OTUID")
write.table(chloroplast_16_tax_table,here::here("local_deicode_runs", "16S_plastidial", "kosmos_chloroplast_tax_table_16samples_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_16S_16_metadata <- as.data.frame(as.matrix(sample_data(chloroplast_16_phyloseq)))
kosmos_16S_16_metadata <- rownames_to_column(kosmos_16S_16_metadata,"#SampleID")
# Add a column for group (M1 after OMZ water addition, or Pacific + first two M1 samples)
kosmos_16S_16_metadata$sample <- as.character(kosmos_16S_16_metadata$sample)
kosmos_16S_16_metadata %>% mutate(.,group = ifelse(sample %in% c("M15","M24","M32","M36","M42","M48"),"M1_postOMZ","P_all_M1_preOMZ")) -> kosmos_16S_16_metadata
# metadata_12S <- metadata_12S %>%
#   dplyr::select("X.SampleID", everything())
# metadata_12S %>% dplyr::rename("#SampleID" = X.SampleID) -> metadata_12S
write.table(kosmos_16S_16_metadata,here::here("local_deicode_runs", "16S_plastidial", "KOSMOS_sample_data_16S_16samples_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

16S chloroplast: DEICODE RPCA, 3 PCs

pco<-read_qza(here::here("local_deicode_runs", "16S_plastidial", "ordination.qza"))


label.PC1 <- paste0("PC1 (", round(pco$data$ProportionExplained$PC1, 3)*100,"%)")
label.PC1
## [1] "PC1 (55.2%)"
label.PC2 <- paste0("PC2 (", round(pco$data$ProportionExplained$PC2, 3)*100,"%)")
label.PC2
## [1] "PC2 (37.4%)"
label.PC3 <- paste0("PC3 (", round(pco$data$ProportionExplained$PC3, 3)*100,"%)")
label.PC3
## [1] "PC3 (7.3%)"
## Prepare PCA data for ggplot
pca_metadata <- kosmos_16S_16_metadata
pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata
#pca_metadata$depth_cat <- factor(pca_metadata$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))
pca_data <- pco$data$Vectors
#pca_data <- left_join(pca_data,pca_metadata,on = "SampleID")
pca_data <- right_join(pca_data,pca_metadata,on = "SampleID")
## Joining, by = "SampleID"
pca_data <- subset(pca_data, !(is.na(pca_data$PC1)))
## Plot PCA in ggplot

pca_gg_sample <- ggplot(pca_data,aes(x=PC1,y=PC2,color = SAMPLING_station_number,shape = SAMPLING_station_number, label = SAMPLING_day))+
  geom_point(size = 6)+
  # Subset some samples to be above right
  geom_text(data = subset(pca_data,sample %in% c("P42","P32","P15","P8","P1","M24","M36")),aes(x=PC1+0.007,y=PC2+0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
    # Subset some samples to be below left
  geom_text(data = subset(pca_data,sample %in% c("M1","M48","M42","M32")),aes(x=PC1-0.007,y=PC2-0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
      # Subset some samples to be below right
  geom_text(data = subset(pca_data,sample %in% c("M8","P48")),aes(x=PC1+0.007,y=PC2-0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
      # Subset some samples to be above left
  geom_text(data = subset(pca_data,sample %in% c("P24","M15","P36")),aes(x=PC1-0.007,y=PC2+0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # geom_text(hjust = 1, vjust=-0.5, segment.alpha = 0, show_guide = FALSE)+
  xlab(print(label.PC1))+
  ylab(print(label.PC2))+
  scale_color_manual(name = "Station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "Station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
  annotate("text",  x=Inf, y = Inf, label = "(c)", vjust=1.5, hjust=1.3,size = 12)+
  theme(panel.background = element_rect(fill = "white",color = "black",size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.title = element_text(size = 20,face = "bold"),
        axis.text = element_blank(),
        axis.ticks.length=unit(0, "cm"),
        plot.margin = margin(0.25, 0, 0.25, 0.25, "cm"),
        legend.position = "none",
        plot.title = element_text(size = 50, face = "bold", hjust = 0.5))+
  xlim(min(pca_data$PC1)-0.03,max(pca_data$PC1)+0.03)+
  ggtitle("16S Plastidial")
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## [1] "PC1 (55.2%)"
## [1] "PC2 (37.4%)"
pca_gg_sample

ggsave(here::here("figures", "PCA_figure", "16S_plastidial_pca_kosmos_3PCs_16samples_9.25.20.png"),pca_gg_sample,width = 7,height = 6)

##### PERMANOVA #####
# Extract metadata
pca_metadata <- pca_data
# pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata

# Create a new column to compare M1 post OMZ water addition with other samples
pca_metadata %>% 
  mutate(permanova_group = ifelse(SAMPLING_station == "Pacific" | SAMPLING_station == "Mesocosm 1" & sample == "M1" | SAMPLING_station == "Mesocosm 1" & sample == "M8",
                                  "Pacific & M1 pre-OMZ", "M1 post-OMZ")) -> pca_metadata

## Load COI Data
distance <- read_qza(here::here("local_deicode_runs", "16S_plastidial", "distance.qza"))

# Extract distance matrix
distance_matrix <- distance$data

# convert DEICODE matrix to "dist" class object
PCA_dist <- as.dist(distance_matrix)
##----Run individual PERMANOVAs-------------------------------------------------------------

# PERMANOVA for CTD vs. ESP
set.seed(123)
M1_v_Pacific_permanova_16S_plastidial <- adonis2(PCA_dist ~ permanova_group, data = pca_metadata, permutations=999)
M1_v_Pacific_permanova_16S_plastidial
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = PCA_dist ~ permanova_group, data = pca_metadata, permutations = 999)
##                 Df SumOfSqs      R2      F Pr(>F)   
## permanova_group  1   11.506 0.33991 7.2092  0.002 **
## Residual        14   22.345 0.66009                 
## Total           15   33.851 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

16S rRNA: bacterial sequences

16S bacteria: Export ASV table,tax_table, and metadata for deicode - only M1 and Pacific samples

bacteria_16_asv_table <- as.data.frame(otu_table(bacteria_16_phyloseq))
bacteria_16_asv_table <- rownames_to_column(bacteria_16_asv_table,"#OTUID")
write.table(bacteria_16_asv_table,here::here("local_deicode_runs", "16S_bacterial", "kosmos_bacteria_asv_table_16samples_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
bacteria_16_tax_table<- as.data.frame(as(tax_table(bacteria_16_phyloseq),"matrix"))
bacteria_16_tax_table <- rownames_to_column(bacteria_16_tax_table,"#OTUID")
write.table(bacteria_16_tax_table,here::here("local_deicode_runs", "16S_bacterial", "kosmos_bacteria_tax_table_16samples_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)
kosmos_16S_16_metadata <- as.data.frame(as.matrix(sample_data(bacteria_16_phyloseq)))
kosmos_16S_16_metadata <- rownames_to_column(kosmos_16S_16_metadata,"#SampleID")
# Add a column for group (M1 after OMZ water addition, or Pacific + first two M1 samples)
kosmos_16S_16_metadata$sample <- as.character(kosmos_16S_16_metadata$sample)
kosmos_16S_16_metadata %>% mutate(.,group = ifelse(sample %in% c("M15","M24","M32","M36","M42","M48"),"M1_postOMZ","P_all_M1_preOMZ")) -> kosmos_16S_16_metadata
# metadata_12S <- metadata_12S %>%
#   dplyr::select("X.SampleID", everything())
# metadata_12S %>% dplyr::rename("#SampleID" = X.SampleID) -> metadata_12S
write.table(kosmos_16S_16_metadata,here::here("local_deicode_runs", "16S_bacterial", "KOSMOS_sample_data_bacteria_16samples_for_biom.txt"),sep = "\t",row.names = FALSE,quote = FALSE)

16S bacteria: DEICODE RPCA, 3 PCs

pco<-read_qza(here::here("local_deicode_runs", "16S_bacterial", "ordination.qza"))

label.PC1 <- paste0("PC1 (", round(pco$data$ProportionExplained$PC1, 3)*100,"%)")
label.PC1
## [1] "PC1 (62%)"
label.PC2 <- paste0("PC2 (", round(pco$data$ProportionExplained$PC2, 3)*100,"%)")
label.PC2
## [1] "PC2 (37.2%)"
label.PC3 <- paste0("PC3 (", round(pco$data$ProportionExplained$PC3, 3)*100,"%)")
label.PC3
## [1] "PC3 (0.9%)"
## Prepare PCA data for ggplot

pca_metadata <- kosmos_16S_metadata
pca_metadata %>% rownames_to_column(.,"#SampleID") -> pca_metadata
pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata
#pca_metadata$depth_cat <- factor(pca_metadata$depth_cat,levels = c("0-5","6-20","20-29","30-39","40-59","60-80","100-199","200-250","280-500","600-700"))
pca_data <- pco$data$Vectors
#pca_data <- left_join(pca_data,pca_metadata,on = "SampleID")
pca_data <- right_join(pca_data,pca_metadata,on = "SampleID")
## Joining, by = "SampleID"
pca_data <- subset(pca_data, !(is.na(pca_data$PC1)))
## Plot PCA in ggplot


pca_gg_sample <- ggplot(pca_data,aes(x=PC1,y=PC2,color = SAMPLING_station_number,shape = SAMPLING_station_number, label = SAMPLING_day))+
  geom_point(size = 6)+
  # Subset some samples to be above right
  geom_text(data = subset(pca_data,sample %in% c("M8","P42","P15","P1","M36")),aes(x=PC1+0.007,y=PC2+0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
    # Subset some samples to be below left
  geom_text(data = subset(pca_data,sample %in% c("M1","M42","P8","M24")),aes(x=PC1-0.007,y=PC2-0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
      # Subset some samples to be below right
  geom_text(data = subset(pca_data,sample %in% c("P48")),aes(x=PC1+0.007,y=PC2-0.04,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 0,size = 10)+
      # Subset some samples to be above left
  geom_text(data = subset(pca_data,sample %in% c("P24","M15","P36","P32","M48","M32")),aes(x=PC1-0.008,y=PC2+0.05,color = SAMPLING_station_number,label = SAMPLING_day),segment.alpha = 0,show_guide = FALSE,label.padding = 1,point.padding = 0.3,box.padding = 0.1,hjust = 1,size = 10)+
  # geom_text(hjust = 1, vjust=-0.5, segment.alpha = 0, show_guide = FALSE)+
  xlab(print(label.PC1))+
  ylab(print(label.PC2))+
  scale_color_manual(name = "Station", values = c("#e41a1c","#377eb8"), labels = c("Mesocosm 1", "Pacific"))+
  scale_shape_manual(name = "Station", values = c(19,17), labels = c("Mesocosm 1", "Pacific"))+
  annotate("text",  x=Inf, y = Inf, label = "(d)", vjust=1.5, hjust=1.3,size = 12)+
  theme(panel.background = element_rect(fill = "white",color = "black",size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.title = element_text(size = 20,face = "bold"),
        axis.text = element_blank(),
        axis.ticks.length=unit(0, "cm"),
        plot.margin = margin(0.25, 0, 0.25, 0.25, "cm"),
        legend.position = "none",
        plot.title = element_text(size = 50, face = "bold", hjust = 0.5))+
  xlim(min(pca_data$PC1)-0.03,max(pca_data$PC1)+0.03)+
  ggtitle("16S Bacterial")
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: Ignoring unknown parameters: segment.alpha, label.padding,
## point.padding, box.padding
## [1] "PC1 (62%)"
## [1] "PC2 (37.2%)"
pca_gg_sample

ggsave(here::here("figures", "PCA_figure", "16S_bacterial_pca_kosmos_3PCs_16samples_9.25.20.png"),pca_gg_sample,width = 7,height = 6)

##### PERMANOVA #####
# Extract metadata
pca_metadata <- pca_data
# pca_metadata %>% dplyr::rename(., "SampleID" = "#SampleID") -> pca_metadata

# Create a new column to compare M1 post OMZ water addition with other samples
pca_metadata %>% 
  mutate(permanova_group = ifelse(SAMPLING_station == "Pacific" | SAMPLING_station == "Mesocosm 1" & sample == "M1" | SAMPLING_station == "Mesocosm 1" & sample == "M8",
                                  "Pacific & M1 pre-OMZ", "M1 post-OMZ")) -> pca_metadata

## Load COI Data
distance <- read_qza(here::here("local_deicode_runs", "16S_bacterial", "distance.qza"))

# Extract distance matrix
distance_matrix <- distance$data

# convert DEICODE matrix to "dist" class object
PCA_dist <- as.dist(distance_matrix)
##----Run individual PERMANOVAs-------------------------------------------------------------

# PERMANOVA for CTD vs. ESP
set.seed(123)
M1_v_Pacific_permanova_16S_bacteria <- adonis2(PCA_dist ~ permanova_group, data = pca_metadata, permutations=999)
M1_v_Pacific_permanova_16S_bacteria
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = PCA_dist ~ permanova_group, data = pca_metadata, permutations = 999)
##                 Df SumOfSqs      R2      F Pr(>F)    
## permanova_group  1   13.529 0.41943 10.114  0.001 ***
## Residual        14   18.727 0.58057                  
## Total           15   32.256 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

PC Loadings Analysis: Unmerged ASVs (Figure 3, bottom half)

Make the legend for all plots and create a color scheme

group_colors = c("Rhodophytes" = "#E15759",      
                 "Bicosoecids" = "#F28E2B",      
                 "Dinoflagellates" = "#B6992D",  
                 "Prymnesiales" = "#499894", 
                 "Diatoms" = "#4E79A7",          
                 "Chlorophyta" = "#59A14F",      
                 "Oomycetes" = "#F1CE63", 
                 "Coccolithophores" = "#A0CBE8", 
                 "Amoebozoa" = "#FF9D9A",        
                 "Cercozoa" = "#FFBE7D",
                 "Cryptophytes" = "#8CD17D",
                 "Embryophytes" = "#D7B5A6",
                 "Silicoflagellates" = "#86BCB6",
                 "Copepods" = "#BAB0AC",
                 "Rotifers" = "#79706E",
                 "Proteobacteria" = "#9D7660",
                 "Bacteroidetes" = "#D4A6C8",
                 "Actinobacteria" = "#FABFD2",
                 "Marinimicrobia" = "#6a3d9a",
                 "Verrucomicrobia" = "#D37295",
                 "Patescibacteria" = "#91003f",
                 "Cyanobacteria" = "#B07AA1",      
                 "Spirotrichs" = "#1b7837",
                 "Unassigned" = "gray10")

# Make a fake dataframe with all of the different taxa that we want
allgroups <- c("Rhodophytes",      
                 "Bicosoecids",      
                 "Dinoflagellates",  
                 "Prymnesiales", 
                 "Diatoms",          
                 "Chlorophyta",      
                 "Oomycetes",       
                 "Coccolithophores", 
                 "Amoebozoa",        
                 "Cercozoa",
                "Spirotrichs",
                 "Cryptophytes",
                 "Embryophytes",
                 "Silicoflagellates",
                 "Copepods",
                 "Rotifers",
                 "Proteobacteria",
                 "Bacteroidetes",
                 "Actinobacteria",
                 "Verrucomicrobia",
                 "Cyanobacteria",
                 "Patescibacteria",
                 "Marinimicrobia",
                  "Unassigned")

fakedata <- rep(0.05,24)
fakesample <- rep("nosample",24)
legend_data <- data.frame(allgroups,fakedata,fakesample)

legend_data$allgroups <- factor(legend_data$allgroups, levels = c("Rhodophytes",      
                 "Bicosoecids",      
                 "Dinoflagellates",  
                 "Prymnesiales", 
                 "Diatoms",          
                 "Chlorophyta",      
                 "Oomycetes",       
                 "Coccolithophores", 
                 "Amoebozoa",        
                 "Cercozoa",
                 "Spirotrichs",
                 "Cryptophytes",
                 "Embryophytes",
                 "Silicoflagellates",
                 "Copepods",
                 "Rotifers",
                 "Proteobacteria",
                 "Bacteroidetes",
                 "Actinobacteria",
                 "Verrucomicrobia",
                 "Cyanobacteria",
                 "Patescibacteria",
                 "Marinimicrobia",
                  "Unassigned"))

legend_gg<-ggplot(legend_data, aes(x=fakesample,y=fakedata,fill=allgroups))+
    geom_bar(stat='identity')+theme_minimal()+
    scale_fill_manual(values = group_colors)+
    theme(axis.text.x = element_blank(),
            legend.position = "right",
          axis.title=element_text(size=8),
          plot.margin = unit(c(0,5,0,0), "pt"),
          legend.text=element_text(size=10),
          legend.title = element_blank()
    )+
    guides(fill=guide_legend(ncol=8))+
    xlab("Sample")+
  ylab("Proportion of Reads")

legend_gg

ggsave(here::here("figures", "PCA_figure", "combined_figure_legend_top30_092520.png"),legend_gg,width = 21, height = 5)

# Make color scheme
group_colors = c("Rhodophytes" = "#E15759",      
                 "Bicosoecids" = "#F28E2B",      
                 "Dinoflagellates" = "#B6992D",  
                 "Prymnesiales" = "#499894", 
                 "Diatoms" = "#4E79A7",          
                 "Chlorophyta" = "#59A14F",      
                 "Oomycetes" = "#F1CE63", 
                 "Coccolithophores" = "#A0CBE8", 
                 "Amoebozoa" = "#FF9D9A",        
                 "Cercozoa" = "#FFBE7D",
                 "Cryptophytes" = "#8CD17D",
                 "Embryophytes" = "#D7B5A6",
                 "Silicoflagellates" = "#86BCB6",
                 "Copepods" = "#BAB0AC",
                 "Rotifers" = "#79706E",
                 "Proteobacteria" = "#9D7660",
                 "Bacteroidetes" = "#D4A6C8",
                 "Actinobacteria" = "#FABFD2",
                 "Marinimicrobia" = "#6a3d9a",
                 "Verrucomicrobia" = "#D37295",
                 "Patescibacteria" = "#91003f",
                 "Cyanobacteria" = "#B07AA1",      
                 "Spirotrichs" = "#1b7837",
                 "other" = "gray10")

COI

Load in PC loadings data

pco <- read_qza(here::here("local_deicode_runs", "COI", "ordination.qza"))
PC_loadings_COI <- pco$data$Species

# Scaling PCA?
# max_PC1 <- max(abs(PC_loadings_COI$PC1))
# PC_loadings_COI %>% mutate(.,PC1_scaled = PC_loadings_COI$PC1/max_PC1) -> PC_loadings_COI


# kosmos_COI_16_tax_table
kosmos_COI_tax_table <- dplyr::rename(kosmos_COI_16_tax_table,FeatureID = "#OTUID")
PC_loadings_COI <- left_join(PC_loadings_COI,kosmos_COI_tax_table,by = "FeatureID")
# Change all taxonomic annotations to character
PC_loadings_COI$Kingdom <- as.character(PC_loadings_COI$Kingdom)
PC_loadings_COI$Phylum <- as.character(PC_loadings_COI$Phylum)
PC_loadings_COI$Class <- as.character(PC_loadings_COI$Class)
PC_loadings_COI$Order <- as.character(PC_loadings_COI$Order)
PC_loadings_COI$Family <- as.character(PC_loadings_COI$Family)
PC_loadings_COI$Genus <- as.character(PC_loadings_COI$Genus)
PC_loadings_COI$Species <- as.character(PC_loadings_COI$Species)

# Add taxonomy field that shows the lowest level of taxonomy
PC_loadings_COI <- PC_loadings_COI[order(-PC_loadings_COI$PC1),]
PC_loadings_COI %>% mutate(.,taxonomy = ifelse(!(Species %in% c("unassigned","no_hit","s_","unknown")),Species,
                                               # ifelse(!(Genus %in% c("unassigned","no_hit","g_","unknown")),paste0(PC_loadings_COI$Genus, " sp."),
                                               ifelse(!(Genus %in% c("unassigned","no_hit","g_","unknown")),paste0(PC_loadings_COI$Genus),
                                                      ifelse(!(Family %in% c("unassigned","no_hit","unknown")),paste0("Family ", PC_loadings_COI$Family),
                                                             ifelse(!(Order %in% c("unassigned","no_hit","unknown")),paste0("Order ", PC_loadings_COI$Order),
                                                                    ifelse(!(Class %in% c("unassigned","no_hit","unknown")),paste0("Class ", PC_loadings_COI$Class),
                                                                           ifelse(!(Phylum %in% c("unassigned","no_hit","unknown")),paste0("Phylum ", PC_loadings_COI$Phylum),
                                                                                  "Unassigned"))))))) -> PC_loadings_COI

# Create new "Group" field that groups taxa together.

PC_loadings_COI %>% mutate(.,group = 
                                   # COI 
                                    ifelse(Phylum %in% c("Bacillariophyta"), "Diatoms",
                                    ifelse(Order %in% c("Isochrysidales"), "Coccolithophores",
                                    ifelse(Class %in% c("Dinophyceae"),"Dinoflagellates",
                                    ifelse(Phylum %in% c("Rhodophyta"),"Rhodophytes",
                                    ifelse(Phylum %in% c("Rotifera"),"Rotifers",
                                    ifelse(Family %in% c("Chrysochromulinaceae"),"Prymnesiales",
                                    ifelse(Class %in% c("Oomycetes"),"Oomycetes",
                                    ifelse(Class %in% c("Hexanauplia"),"Copepods",
                                    ifelse(Family %in% c("Cafeteriaceae"),"Bicosoecids",
                                    ifelse(Family %in% c("Vexilliferidae"),"Amoebozoa",
                                    ifelse(Family %in% c("Paulinellidae"),"Cercozoa",
                                    ifelse(Family %in% c("Pycnococcaceae"),"Chlorophyta",
                                  
                                    # Chloroplast       
                                    ifelse(Class %in% c("Cryptophyceae"),"Cryptophytes",
                                    ifelse(Class %in% c("Embryophyceae"), "Embryophytes",
                                    ifelse(Class %in% c("Dictyophyceae"), "Silicoflagellates",
                                    ifelse(Family %in% c("Chrysochromulinaceae"),"Prymnesiales",
                                    ifelse(Class %in% c("Trebouxiophyceae"),"Chlorophyta",
                                           "other")))))))))))))))))) -> PC_loadings_COI

#9D7660
#D4A6C8
#FABFD2
#BAB0AC
#F1CE63

unique(PC_loadings_COI$group)
##  [1] "Rotifers"         "Cercozoa"         "Coccolithophores" "Oomycetes"       
##  [5] "other"            "Amoebozoa"        "Copepods"         "Diatoms"         
##  [9] "Rhodophytes"      "Prymnesiales"     "Dinoflagellates"  "Bicosoecids"     
## [13] "Chlorophyta"
# Add in information from ASV_tophit_F_taxonomy table
tophit_taxonomy <- read.csv("/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/COI/X/ASV_tophit_Ftaxonomy.csv")
colnames(tophit_taxonomy)[1] <- "FeatureID"
colnames(tophit_taxonomy)[6] <- "precent_ID"

tophit_taxonomy <- tophit_taxonomy[,c(1:6,9)]
PC_loadings_COI <- left_join(PC_loadings_COI,tophit_taxonomy,by = "FeatureID")



#write.csv(PC_loadings_COI,"/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/PC_loadings_COI.csv")

Subset out only abundant ASVs based on cutoff value

### Normalize each sample to 1, have the proportion of reads be for each sample
kosmos_COI_asv_table <- as.data.frame(otu_table(kosmos_COI_16_phyloseq))
kosmos_COI_asv_table_matrix <- as.matrix(kosmos_COI_asv_table)
kosmos_COI_sample_sums <- as.vector(colSums(kosmos_COI_asv_table))
t(t(kosmos_COI_asv_table_matrix)/kosmos_COI_sample_sums) -> kosmos_COI_normalized_asv_table
kosmos_COI_normalized_asv_table <- as.data.frame(kosmos_COI_normalized_asv_table)

# Subset the ASV table based on a relative abundance threshold in multiple samples
# The problem with this approach is that it can eliminate taxa that don't have a lot of reads because of amplification bias, like Akashiwo
## Set threshold to at least 0.001 in at least 4 samples -> gives us 88 ASVs. Totally manageable. This is loose enough to keep Akashiwo in the picture.
# Add a column to the normalized asv table to store number of values over our cutoff (in this case 0.001)
kosmos_COI_normalized_asv_table %>% rownames_to_column(.,"ASV") -> kosmos_COI_normalized_asv_table
kosmos_COI_normalized_asv_table %>% mutate(.,novercutoff = 0) -> kosmos_COI_normalized_asv_table
kosmos_COI_normalized_asv_table %>% column_to_rownames(.,"ASV") -> kosmos_COI_normalized_asv_table
for (i in seq(1:nrow(kosmos_COI_normalized_asv_table))){
  kosmos_COI_normalized_asv_table$novercutoff[i] = sum(kosmos_COI_normalized_asv_table[i,] > 0.001)
}
kosmos_COI_normalized_asv_table_subset <- subset(kosmos_COI_normalized_asv_table, novercutoff >= 4)
kosmos_COI_normalized_asv_table_subset <- kosmos_COI_normalized_asv_table_subset[!(names(kosmos_COI_normalized_asv_table_subset) %in% c("novercutoff"))]

# Save ASVs to keep as a list:
abundant_ASVs <- rownames(kosmos_COI_normalized_asv_table_subset)

Make PC1 loadings plot of only abundant ASVs

PC_loadings_COI_subset <- subset(PC_loadings_COI, FeatureID %in% abundant_ASVs)

kosmos_COI_asv_table_for_supp <- rownames_to_column(kosmos_COI_asv_table, "FeatureID")
supp_table <- left_join(PC_loadings_COI_subset,kosmos_COI_asv_table_for_supp, by = "FeatureID")
# write.csv(supp_table,"/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/temp/supplementary_table_1.csv")

# Plot all ASVs that are abundant (for supplemental figures)
COI_PCA_gg <- ggplot(PC_loadings_COI_subset,aes(x = reorder(FeatureID,PC1),y = PC1, fill = Class))+
  geom_bar(stat = "identity")+
  ylim(-0.25,0.25)+
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.length.x=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text.y = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title.y = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  annotate("segment", x = 15.5,xend = 15.5, y = 0, yend = -0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 0,xend = 15.5, y = -0.2, yend = -0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 0,xend = 15.5, y = 0, yend = 0, color = "black", size = 1, lty = 3)+  
  annotate("text", x = 8, y = -0.175,label = "Top 15 ASVs\n(negative)",size = 4.5,fontface = "bold")+
  annotate("segment", x = 72.5,xend = 72.5, y = 0, yend = 0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 72.5,xend = 88, y = 0.2, yend = 0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 72.5,xend = 88, y = 0, yend = 0, color = "black", size = 1, lty = 3)+  
  annotate("text", x = 80, y = 0.175,label = "Top 15 ASVs\n(positive)",size = 4.5,fontface = "bold")+
  # annotate("text",  x=Inf, y = Inf, label = "(a)", vjust=1.5, hjust=1.25,size = 20)+
  scale_fill_tableau(palette = "Tableau 20")


COI_PCA_gg

# ggsave("/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/temp/COI_PC1_unmerged_all_052220.png",COI_PCA_gg,width = 10, height = 7.5)

# Plot all taxa, top 15 on either side

PC_loadings_COI_30 <- PC_loadings_COI_subset[c(1:15, (nrow(PC_loadings_COI_subset)-14):nrow(PC_loadings_COI_subset)),]

COI_PCA_gg <- ggplot(PC_loadings_COI_30,aes(x = reorder(FeatureID,PC1),y = PC1, fill = group))+
  geom_bar(stat = "identity")+
  coord_flip()+
  geom_text(data = PC_loadings_COI_30,aes(x = reorder(FeatureID,PC1),y = ifelse(PC1 > 0, -0.01,0.01), label = taxonomy, hjust = ifelse(PC1 > 0, 1,0),color = ifelse(taxonomy =="Unassigned","gray60","black"),fontface = ifelse(!(Genus %in% c("g_","unassigned","no_hit")),"bold.italic","bold")), size = 5)+
  ylim(-0.21,0.21)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.length.y=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        legend.position = "none",
        axis.text.x = element_text(size = 15),
        axis.ticks.length.x = unit(0.25,"cm"),
        axis.title.x = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        plot.margin = margin(0, 0, 0, 0.95, "cm"),
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  scale_fill_manual(values = group_colors)+
  scale_color_manual(values = c("black","gray60"))+
  annotate("text",  x=Inf, y = Inf, label = "(e)", vjust=1.5, hjust=1.25,size = 12)

COI_PCA_gg

ggsave(here::here("figures", "PCA_figure", "COI_PC1_unmerged_top30_072720.png"),COI_PCA_gg,width = 7, height = 8.5)

18S

Load in PC loadings data

pco <- read_qza(here::here("local_deicode_runs", "18S", "ordination.qza"))
PC_loadings_18S <- pco$data$Species

# Scaling PCA?
# max_PC1 <- max(abs(PC_loadings_18S$PC1))
# PC_loadings_18S %>% mutate(.,PC1_scaled = PC_loadings_18S$PC1/max_PC1) -> PC_loadings_18S


# kosmos_18S_16_tax_table
kosmos_18S_tax_table <- dplyr::rename(kosmos_18S_16_tax_table,FeatureID = "#OTUID")
PC_loadings_18S <- left_join(PC_loadings_18S,kosmos_18S_tax_table,by = "FeatureID")
# Change all taxonomic annotations to character
PC_loadings_18S$Kingdom <- as.character(PC_loadings_18S$Kingdom)
PC_loadings_18S$Phylum <- as.character(PC_loadings_18S$Phylum)
PC_loadings_18S$Class <- as.character(PC_loadings_18S$Class)
PC_loadings_18S$Order <- as.character(PC_loadings_18S$Order)
PC_loadings_18S$Family <- as.character(PC_loadings_18S$Family)
PC_loadings_18S$Genus <- as.character(PC_loadings_18S$Genus)
PC_loadings_18S$Species <- as.character(PC_loadings_18S$Species)

# Add taxonomy field that shows the lowest level of taxonomy
PC_loadings_18S <- PC_loadings_18S[order(-PC_loadings_18S$PC1),]
PC_loadings_18S %>% mutate(.,taxonomy = ifelse(!(Species %in% c("unassigned","no_hit","s_","unknown","uncultured marine picoeukaryote")),Species,
                                               # ifelse(!(Genus %in% c("unassigned","no_hit","g_","unknown")),paste0(PC_loadings_18S$Genus, " sp."),
                                               ifelse(!(Genus %in% c("unassigned","no_hit","g_","unknown")),paste0(PC_loadings_18S$Genus),
                                                      ifelse(!(Family %in% c("unassigned","no_hit","unknown")),paste0("Family ", PC_loadings_18S$Family),
                                                             ifelse(!(Order %in% c("unassigned","no_hit","unknown")),paste0("Order ", PC_loadings_18S$Order),
                                                                    ifelse(!(Class %in% c("unassigned","no_hit","unknown")),paste0("Class ", PC_loadings_18S$Class),
                                                                           ifelse(!(Phylum %in% c("unassigned","no_hit","unknown")),paste0("Phylum ", PC_loadings_18S$Phylum),
                                                                                  "Unassigned"))))))) -> PC_loadings_18S

# Create new "Group" field that groups taxa together.

PC_loadings_18S %>% mutate(.,group = 
                             # 18S & COI
                             ifelse(Phylum %in% c("Bacillariophyta"), "Diatoms",
                                    ifelse(Order %in% c("Isochrysidales"), "Coccolithophores",
                                           ifelse(Class %in% c("Dinophyceae"),"Dinoflagellates",
                                                  ifelse(Phylum %in% c("Rhodophyta"),"Rhodophytes",
                                                         ifelse(Phylum %in% c("Rotifera"),"Rotifers",
                                                                ifelse(Family %in% c("Chrysochromulinaceae"),"Prymnesiales",
                                                                       ifelse(Class %in% c("Oomycetes"),"Oomycetes",
                                                                              ifelse(Class %in% c("Hexanauplia"),"Copepods",
                                                                                     ifelse(Family %in% c("Cafeteriaceae"),"Bicosoecids",
                                                                                            ifelse(Family %in% c("Vexilliferidae"),"Amoebozoa",
                                                                                                   ifelse(Family %in% c("Paulinellidae") | Genus == "Protaspis","Cercozoa",
                                                                                                          ifelse(Family %in% c("Pycnococcaceae"),"Chlorophyta",
                                                                                                                 ifelse(Class == "Spirotrichea", "Spirotrichs",
                                                                                                                 
                                                                                                                 # Chloroplast       
                                                                                                                 ifelse(Class %in% c("Cryptophyceae"),"Cryptophytes",
                                                                                                                        ifelse(Class %in% c("Embryophyceae"), "Embryophytes",
                                                                                                                               ifelse(Class %in% c("Dictyophyceae"), "Silicoflagellates",
                                                                                                                                      ifelse(Family %in% c("Chrysochromulinaceae"),"Prymnesiales",
                                                                                                                                             ifelse(Class %in% c("Trebouxiophyceae"),"Chlorophyta",
                                                                                                                                                    "other"))))))))))))))))))) -> PC_loadings_18S

#9D7660
#D4A6C8
#FABFD2
#BAB0AC
#F1CE63

# unique(PC_loadings_18S$group)

# Add in information from ASV_tophit_F_taxonomy table
tophit_taxonomy <- read.csv("/Users/markusmin/Documents/MBARI-2167/local/eDNA_meta_analysis_local_data/banzai_dada2/18S/UU/ASV_tophit_Ftaxonomy.csv")
colnames(tophit_taxonomy)[1] <- "FeatureID"
colnames(tophit_taxonomy)[6] <- "precent_ID"

tophit_taxonomy <- tophit_taxonomy[,c(1:6,9)]
PC_loadings_18S <- left_join(PC_loadings_18S,tophit_taxonomy,by = "FeatureID")



#write.csv(PC_loadings_18S,"/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/PC_loadings_18S.csv")

Subset out only abundant ASVs based on cutoff value

### Normalize each sample to 1, have the proportion of reads be for each sample
kosmos_18S_asv_table <- as.data.frame(otu_table(kosmos_18S_16_phyloseq))
kosmos_18S_asv_table_matrix <- as.matrix(kosmos_18S_asv_table)
kosmos_18S_sample_sums <- as.vector(colSums(kosmos_18S_asv_table))
t(t(kosmos_18S_asv_table_matrix)/kosmos_18S_sample_sums) -> kosmos_18S_normalized_asv_table
kosmos_18S_normalized_asv_table <- as.data.frame(kosmos_18S_normalized_asv_table)

# Subset the ASV table based on a relative abundance threshold in multiple samples
# The problem with this approach is that it can eliminate taxa that don't have a lot of reads because of amplification bias, like Akashiwo
## Set threshold to at least 0.001 in at least 4 samples -> gives us 88 ASVs. Totally manageable. This is loose enough to keep Akashiwo in the picture.
# Add a column to the normalized asv table to store number of values over our cutoff (in this case 0.001)
kosmos_18S_normalized_asv_table %>% rownames_to_column(.,"ASV") -> kosmos_18S_normalized_asv_table
kosmos_18S_normalized_asv_table %>% mutate(.,novercutoff = 0) -> kosmos_18S_normalized_asv_table
kosmos_18S_normalized_asv_table %>% column_to_rownames(.,"ASV") -> kosmos_18S_normalized_asv_table
for (i in seq(1:nrow(kosmos_18S_normalized_asv_table))){
  kosmos_18S_normalized_asv_table$novercutoff[i] = sum(kosmos_18S_normalized_asv_table[i,] > 0.001)
}
kosmos_18S_normalized_asv_table_subset <- subset(kosmos_18S_normalized_asv_table, novercutoff >= 4)
kosmos_18S_normalized_asv_table_subset <- kosmos_18S_normalized_asv_table_subset[!(names(kosmos_18S_normalized_asv_table_subset) %in% c("novercutoff"))]

# Save ASVs to keep as a list:
abundant_ASVs <- rownames(kosmos_18S_normalized_asv_table_subset)

Make PC1 loadings plot of only abundant ASVs

PC_loadings_18S_subset <- subset(PC_loadings_18S, FeatureID %in% abundant_ASVs)

kosmos_18S_asv_table_for_supp <- rownames_to_column(kosmos_18S_asv_table, "FeatureID")
supp_table <- left_join(PC_loadings_18S_subset,kosmos_18S_asv_table_for_supp, by = "FeatureID")
# write.csv(supp_table,"/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/temp/supplementary_table_1.csv")

# Plot all ASVs that are abundant (for supplemental figures)
PCA_gg_18S <- ggplot(PC_loadings_18S_subset,aes(x = reorder(FeatureID,PC1),y = PC1, fill = Class))+
  geom_bar(stat = "identity")+
  ylim(-0.25,0.25)+
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.length.x=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text.y = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title.y = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  annotate("segment", x = 15.5,xend = 15.5, y = 0, yend = -0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 0,xend = 15.5, y = -0.2, yend = -0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 0,xend = 15.5, y = 0, yend = 0, color = "black", size = 1, lty = 3)+  
  annotate("text", x = 8, y = -0.175,label = "Top 15 ASVs\n(negative)",size = 4.5,fontface = "bold")+
  annotate("segment", x = 72.5,xend = 72.5, y = 0, yend = 0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 72.5,xend = 88, y = 0.2, yend = 0.2, color = "black", size = 1, lty = 3)+
  annotate("segment", x = 72.5,xend = 88, y = 0, yend = 0, color = "black", size = 1, lty = 3)+  
  annotate("text", x = 80, y = 0.175,label = "Top 15 ASVs\n(positive)",size = 4.5,fontface = "bold")+
  # annotate("text",  x=Inf, y = Inf, label = "(a)", vjust=1.5, hjust=1.25,size = 20)+
  scale_fill_tableau(palette = "Tableau 20")


PCA_gg_18S

# ggsave("/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/temp/18S_PC1_unmerged_all_052220.png",PCA_gg_18S,width = 10, height = 7.5)

# Plot all taxa, top 15 on either side

PC_loadings_18S_30 <- PC_loadings_18S_subset[c(1:15, (nrow(PC_loadings_18S_subset)-14):nrow(PC_loadings_18S_subset)),]

PCA_gg_18S <- ggplot(PC_loadings_18S_30,aes(x = reorder(FeatureID,PC1),y = PC1, fill = group))+
  geom_bar(stat = "identity")+
  coord_flip()+
  geom_text(data = PC_loadings_18S_30,aes(x = reorder(FeatureID,PC1),y = ifelse(PC1 > 0, -0.01,0.01), label = taxonomy, hjust = ifelse(PC1 > 0, 1,0),color = ifelse(taxonomy =="Unassigned","gray60","black"),fontface = ifelse(!(Genus %in% c("g_","unassigned","no_hit")),"bold.italic","bold")), size = 5)+
  ylim(-0.21,0.21)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.length.y=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        legend.position = "none",
        axis.text.x = element_text(size = 15),
        axis.ticks.length.x = unit(0.25,"cm"),
        axis.title.x = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        plot.margin = margin(0, 0, 0, 0.95, "cm"),
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  scale_fill_manual(values = group_colors)+
  scale_color_manual(values = c("black","gray60"))+
  annotate("text",  x=Inf, y = Inf, label = "(f)", vjust=1.5, hjust=1.25,size = 12)

PCA_gg_18S

ggsave(here::here("figures", "PCA_figure", "18S_PC1_unmerged_top30_092520.png"),PCA_gg_18S,width = 7, height = 8.5)

chloroplast (16S Plastidial)

Load in PC loadings data

pco<-read_qza(here::here("local_deicode_runs", "16S_plastidial", "ordination.qza"))
PC_loadings_chloroplast <- pco$data$Species
kosmos_chloroplast_tax_table <- dplyr::rename(chloroplast_16_tax_table,FeatureID = "#OTUID")
PC_loadings_chloroplast <- left_join(PC_loadings_chloroplast,kosmos_chloroplast_tax_table,by = "FeatureID")
# Change all taxonomic annotations to character
PC_loadings_chloroplast$Kingdom <- as.character(PC_loadings_chloroplast$Kingdom)
PC_loadings_chloroplast$Supergroup <- as.character(PC_loadings_chloroplast$Supergroup)
PC_loadings_chloroplast$Phylum <- as.character(PC_loadings_chloroplast$Phylum)
PC_loadings_chloroplast$Class <- as.character(PC_loadings_chloroplast$Class)
PC_loadings_chloroplast$Subclass <- as.character(PC_loadings_chloroplast$Subclass)
PC_loadings_chloroplast$Order <- as.character(PC_loadings_chloroplast$Order)
PC_loadings_chloroplast$Family <- as.character(PC_loadings_chloroplast$Family)
PC_loadings_chloroplast$Genus <- as.character(PC_loadings_chloroplast$Genus)
PC_loadings_chloroplast$Species <- as.character(PC_loadings_chloroplast$Species)

# Add new taxonomy column
PC_loadings_chloroplast <- PC_loadings_chloroplast[order(-PC_loadings_chloroplast$PC1),]
PC_loadings_chloroplast %>% mutate(.,taxonomy = ifelse(!(Species %in% c("")),Species,
                                                       ifelse(!(Genus %in% c("")),paste0(PC_loadings_chloroplast$Genus),
                                               # ifelse(!(Genus %in% c("")),paste0(PC_loadings_chloroplast$Genus, " sp."),
                                                      ifelse(!(Family %in% c("")),paste0("Family ", PC_loadings_chloroplast$Family),
                                                             ifelse(!(Order %in% c("")),paste0("Order ", PC_loadings_chloroplast$Order),
                                                                    ifelse(!(Subclass %in% c("")),paste0("Subclass ", PC_loadings_chloroplast$Subclass),
                                                                      ifelse(!(Class %in% c("")),paste0("Class ", PC_loadings_chloroplast$Class),
                                                                           ifelse(!(Phylum %in% c("")),paste0("Phylum ", PC_loadings_chloroplast$Phylum),
                                                                              "Unassigned")))))))) -> PC_loadings_chloroplast


PC_loadings_chloroplast %>% mutate(.,group = 
                                   # COI 
                                    ifelse(Phylum %in% c("Bacillariophyta"), "Diatoms",
                                    ifelse(Class %in% c("Bacillariophyta"), "Diatoms",
                                    ifelse(Order %in% c("Isochrysidales"), "Coccolithophores",
                                    ifelse(Class %in% c("Dinophyceae"),"Dinoflagellates",
                                    ifelse(Phylum %in% c("Rhodophyta"),"Rhodophytes",
                                    ifelse(Phylum %in% c("Rotifera"),"Rotifers",
                                    ifelse(Family %in% c("Chrysochromulinaceae"),"Prymnesiales",
                                    ifelse(Class %in% c("Oomycetes"),"Oomycetes",
                                    ifelse(Class %in% c("Hexanauplia"),"Copepods",
                                    ifelse(Family %in% c("Cafeteriaceae"),"Bicosoecids",
                                    ifelse(Family %in% c("Vexilliferidae"),"Amoebozoa",
                                    ifelse(Family %in% c("Paulinellidae"),"Cercozoa",
                                    ifelse(Family %in% c("Pycnococcaceae"),"Chlorophyta",
                                  
                                    # Chloroplast       
                                    ifelse(Class %in% c("Cryptophyceae"),"Cryptophytes",
                                    ifelse(Class %in% c("Embryophyceae"), "Embryophytes",
                                    ifelse(Class %in% c("Dictyochophyceae"), "Silicoflagellates",
                                    ifelse(Family %in% c("Chrysochromulinaceae"),"Prymnesiales",
                                    ifelse(Class %in% c("Trebouxiophyceae"),"Chlorophyta",
                                           "other"))))))))))))))))))) -> PC_loadings_chloroplast

Subset out only abundant ASVs based on cutoff value

### Normalize each sample to 1, have the proportion of reads be for each sample
chloroplast_asv_table <- as.data.frame(otu_table(chloroplast_16_phyloseq))
chloroplast_asv_table_matrix <- as.matrix(chloroplast_asv_table)
chloroplast_sample_sums <- as.vector(colSums(chloroplast_asv_table))
t(t(chloroplast_asv_table_matrix)/chloroplast_sample_sums) -> chloroplast_normalized_asv_table
chloroplast_normalized_asv_table <- as.data.frame(chloroplast_normalized_asv_table)

# Subset the ASV table based on a relative abundance threshold in multiple samples
# The problem with this approach is that it can eliminate taxa that don't have a lot of reads because of amplification bias, like Akashiwo
## Set threshold to at least 0.001 in at least 4 samples -> gives us 88 ASVs. Totally manageable. This is loose enough to keep Akashiwo in the picture.
# Add a column to the normalized asv table to store number of values over our cutoff (in this case 0.001)
chloroplast_normalized_asv_table %>% mutate(.,novercutoff = 0) -> chloroplast_normalized_asv_table
for (i in seq(1:nrow(chloroplast_normalized_asv_table))){
  chloroplast_normalized_asv_table$novercutoff[i] = sum(chloroplast_normalized_asv_table[i,] > 0.001)
}
chloroplast_normalized_asv_table_subset <- subset(chloroplast_normalized_asv_table, novercutoff >= 4)
chloroplast_normalized_asv_table_subset <- chloroplast_normalized_asv_table_subset[!(names(chloroplast_normalized_asv_table_subset) %in% c("novercutoff"))]

# Save ASVs to keep as a list:
abundant_ASVs <- rownames(chloroplast_normalized_asv_table_subset)

Make PC1 loadings plot of only abundant ASVs

PC_loadings_chloroplast_subset <- subset(PC_loadings_chloroplast, FeatureID %in% abundant_ASVs)

chloroplast_PCA_gg <- ggplot(PC_loadings_chloroplast_subset,aes(x = reorder(FeatureID,PC1),y = PC1, fill = Family))+
  geom_bar(stat = "identity")+
  ylim(-0.31,0.31)+
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.length.x=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text.y = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title.y = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  # scale_fill_manual(values = PC_loadings_colors)+
  annotate("text",  x=Inf, y = Inf, label = "(a)", vjust=1.5, hjust=1.25,size = 20)

chloroplast_PCA_gg

# Plot top 15 positive and negative, all reads

PC_loadings_chloroplast_30 <- PC_loadings_chloroplast_subset[c(1:15, (nrow(PC_loadings_chloroplast_subset)-14):nrow(PC_loadings_chloroplast_subset)),]

chloroplast_PCA_gg <- ggplot(PC_loadings_chloroplast_30,aes(x = reorder(FeatureID,PC1),y = PC1, fill = group))+
  geom_bar(stat = "identity")+
  coord_flip()+
  geom_text(data = PC_loadings_chloroplast_30,aes(x = reorder(FeatureID,PC1),y = ifelse(PC1 > 0, -0.01,0.01), label = taxonomy, hjust = ifelse(PC1 > 0, 1,0),color = ifelse(taxonomy =="Unassigned","gray60","black")), size = 5,fontface = "bold")+
  ylim(-0.4,0.4)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.length.y=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        legend.position = "none",
        axis.text.x = element_text(size = 15),
        axis.ticks.length.x = unit(0.25,"cm"),
        axis.title.x = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        plot.margin = margin(0, 0, 0, 0.95, "cm"),
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  scale_fill_manual(values = group_colors)+
  scale_color_manual(values = c("black","gray60"))+
  annotate("text",  x=Inf, y = Inf, label = "(g)", vjust=1.5, hjust=1.25,size = 12)

chloroplast_PCA_gg

ggsave(here::here("figures", "PCA_figure", "16S_plastidial_PC1_unmerged_top30_092520.png"),chloroplast_PCA_gg,width = 7, height = 8.5)

16S Bacterial

bacteria unmerged

pco<-read_qza(here::here("local_deicode_runs", "16S_bacterial", "ordination.qza"))
PC_loadings_bacteria <- pco$data$Species
kosmos_bacteria_tax_table <- dplyr::rename(bacteria_16_tax_table,FeatureID = "#OTUID")
PC_loadings_bacteria <- left_join(PC_loadings_bacteria,kosmos_bacteria_tax_table,by = "FeatureID")
# Change all taxonomic annotations to character
PC_loadings_bacteria$Kingdom <- as.character(PC_loadings_bacteria$Kingdom)
PC_loadings_bacteria$Phylum <- as.character(PC_loadings_bacteria$Phylum)
PC_loadings_bacteria$Class <- as.character(PC_loadings_bacteria$Class)
PC_loadings_bacteria$Order <- as.character(PC_loadings_bacteria$Order)
PC_loadings_bacteria$Family <- as.character(PC_loadings_bacteria$Family)
PC_loadings_bacteria$Genus <- as.character(PC_loadings_bacteria$Genus)
PC_loadings_bacteria$Species <- as.character(PC_loadings_bacteria$Species)


# Based on David's note, change the taxonomy of the "unassigned" bacterial ASV to Class Alphaproteobacteria, Phylum Proteobacteria, Kingdom Bacteria
PC_loadings_bacteria <- within(PC_loadings_bacteria, Kingdom[FeatureID == 'ec69e4a2fa57ecb7e50cf2e897b37a8e$Unassigned'] <- 'Bacteria')
PC_loadings_bacteria <- within(PC_loadings_bacteria, Phylum[FeatureID == 'ec69e4a2fa57ecb7e50cf2e897b37a8e$Unassigned'] <- 'Proteobacteria')
PC_loadings_bacteria <- within(PC_loadings_bacteria, Class[FeatureID == 'ec69e4a2fa57ecb7e50cf2e897b37a8e$Unassigned'] <- 'Alphaproteobacteria')


# Sort by PC1 loadings
PC_loadings_bacteria <- PC_loadings_bacteria[order(-PC_loadings_bacteria$PC1),]
PC_loadings_bacteria %>% mutate(.,taxonomy = ifelse(!(Species %in% c("","uncultured_bacterium","uncultured_marine_bacterium","uncultured")),Species,
                                                    ifelse(!(Genus %in% c("","uncultured_bacterium","uncultured_marine_bacterium","uncultured")),paste0(PC_loadings_bacteria$Genus),
                                               # ifelse(!(Genus %in% c("","uncultured_bacterium","uncultured_marine_bacterium","uncultured")),paste0(PC_loadings_bacteria$Genus, " sp."),
                                                      ifelse(!(Family %in% c("","uncultured_bacterium","uncultured_marine_bacterium","uncultured")),paste0("Family ", PC_loadings_bacteria$Family),
                                                             ifelse(!(Order %in% c("","uncultured_bacterium","uncultured_marine_bacterium","uncultured")),paste0("Order ", PC_loadings_bacteria$Order),
                                                                      ifelse(!(Class %in% c("")),paste0("Class ", PC_loadings_bacteria$Class),
                                                                           ifelse(!(Phylum %in% c("")),paste0("Phylum ", PC_loadings_bacteria$Phylum),
                                                                                  ifelse(!(Kingdom %in% c("","Unassigned")),paste0("Kingdom ", PC_loadings_bacteria$Kingdom),"Unassigned")))))))) -> PC_loadings_bacteria

PC_loadings_bacteria %>% mutate(.,group = 
                                   # COI 
                                    ifelse(Phylum %in% c("Bacillariophyta"), "Diatoms",
                                    ifelse(Order %in% c("Isochrysidales"), "Coccolithophores",
                                    ifelse(Class %in% c("Dinophyceae"),"Dinoflagellates",
                                    ifelse(Phylum %in% c("Rhodophyta"),"Rhodophytes",
                                    ifelse(Phylum %in% c("Rotifera"),"Rotifers",
                                    ifelse(Family %in% c("Chrysochromulinaceae"),"Chrysochromulina",
                                    ifelse(Class %in% c("Oomycetes"),"Oomycetes",
                                    ifelse(Class %in% c("Hexanauplia"),"Copepods",
                                    ifelse(Family %in% c("Cafeteriaceae"),"Bicosoecids",
                                    ifelse(Family %in% c("Vexilliferidae"),"Amoebozoa",
                                    ifelse(Family %in% c("Paulinellidae"),"Cercozoa",
                                    ifelse(Family %in% c("Pycnococcaceae"),"Chlorophyta",
                                  
                                    # Chloroplast       
                                    ifelse(Class %in% c("Cryptophyceae"),"Cryptophytes",
                                    ifelse(Class %in% c("Embryophyceae"), "Embryophytes",
                                    ifelse(Class %in% c("Dictyophyceae"), "Silicoflagellates",
                                    ifelse(Family %in% c("Chrysochromulinaceae"),"Chrysochromulina",
                                    ifelse(Class %in% c("Trebouxiophyceae"),"Chlorophyta",
                                           "other")))))))))))))))))) -> PC_loadings_bacteria

# write.csv(PC_loadings_bacteria,"/Users/mmin/Documents/KOSMOS_data_local/PC_loadings_analysis/PC_loadings_bacteria.csv")

Subset out only abundant ASVs based on cutoff value

### Normalize each sample to 1, have the proportion of reads be for each sample
bacteria_asv_table <- as.data.frame(otu_table(bacteria_16_phyloseq))
bacteria_asv_table_matrix <- as.matrix(bacteria_asv_table)
bacteria_sample_sums <- as.vector(colSums(bacteria_asv_table))
t(t(bacteria_asv_table_matrix)/bacteria_sample_sums) -> bacteria_normalized_asv_table
bacteria_normalized_asv_table <- as.data.frame(bacteria_normalized_asv_table)

# Subset the ASV table based on a relative abundance threshold in multiple samples
# The problem with this approach is that it can eliminate taxa that don't have a lot of reads because of amplification bias, like Akashiwo
## Set threshold to at least 0.001 in at least 4 samples -> gives us 88 ASVs. Totally manageable. This is loose enough to keep Akashiwo in the picture.
# Add a column to the normalized asv table to store number of values over our cutoff (in this case 0.001)
bacteria_normalized_asv_table <- rownames_to_column(bacteria_normalized_asv_table, "ASV")
bacteria_normalized_asv_table %>% mutate(.,novercutoff = 0) -> bacteria_normalized_asv_table
bacteria_normalized_asv_table <- column_to_rownames(bacteria_normalized_asv_table, "ASV")
for (i in seq(1:nrow(bacteria_normalized_asv_table))){
  bacteria_normalized_asv_table$novercutoff[i] = sum(bacteria_normalized_asv_table[i,] > 0.001)
}
bacteria_normalized_asv_table_subset <- subset(bacteria_normalized_asv_table, novercutoff >= 4)
bacteria_normalized_asv_table_subset <- bacteria_normalized_asv_table_subset[!(names(bacteria_normalized_asv_table_subset) %in% c("novercutoff"))]

# Save ASVs to keep as a list:
abundant_ASVs <- rownames(bacteria_normalized_asv_table_subset)

Make PC1 loadings plot of only abundant ASVs

PC_loadings_bacteria_subset <- subset(PC_loadings_bacteria, FeatureID %in% abundant_ASVs)

bacteria_PCA_gg <- ggplot(PC_loadings_bacteria_subset,aes(x = reorder(FeatureID,PC1),y = PC1, fill = Family))+
  geom_bar(stat = "identity")+
  ylim(-0.31,0.31)+
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.length.x=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text.y = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title.y = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  # scale_fill_manual(values = PC_loadings_colors)+
  annotate("text",  x=Inf, y = Inf, label = "(a)", vjust=1.5, hjust=1.25,size = 20)

bacteria_PCA_gg

# Plot top 15 positive and negative, all ASVs

PC_loadings_bacteria_30 <- PC_loadings_bacteria_subset[c(1:15, (nrow(PC_loadings_bacteria_subset)-14):nrow(PC_loadings_bacteria_subset)),]


# Change some labels
## Marinimicrobia
PC_loadings_bacteria_30 %>% mutate(.,Phylum = ifelse(Phylum == "Marinimicrobia_(SAR406_clade)","Marinimicrobia",
                                                    ifelse(Phylum == "","other", Phylum))) -> PC_loadings_bacteria_30
PC_loadings_bacteria_30 %>% mutate(.,taxonomy = ifelse(taxonomy == "Phylum Marinimicrobia_(SAR406_clade)","Phylum Marinimicrobia",taxonomy)) -> PC_loadings_bacteria_30
## Clade la
PC_loadings_bacteria_30 %>% mutate(.,taxonomy = ifelse(taxonomy == "Clade_Ia","SAR11 Ia",taxonomy)) -> PC_loadings_bacteria_30
## NS9_marine_group
PC_loadings_bacteria_30 %>% mutate(.,taxonomy = ifelse(taxonomy == "Family NS9_marine_group","NS9_marine_group",taxonomy)) -> PC_loadings_bacteria_30
## SAR86
PC_loadings_bacteria_30 %>% mutate(.,taxonomy = ifelse(taxonomy == "Order SAR86_clade","SAR86 clade",taxonomy)) -> PC_loadings_bacteria_30

# Change the label of Synechococcus CC9902, as suggested by Sebastian
PC_loadings_bacteria_30 %>% mutate(.,taxonomy = ifelse(taxonomy == "Synechococcus_CC9902","Family Synechococcaceae",taxonomy)) -> PC_loadings_bacteria_30

# Remove underscores from group labels
PC_loadings_bacteria_30 %>% mutate(.,taxonomy = gsub("_"," ",taxonomy)) -> PC_loadings_bacteria_30

bacteria_PCA_gg <- ggplot(PC_loadings_bacteria_30,aes(x = reorder(FeatureID,PC1),y = PC1, fill = Phylum))+
  geom_bar(stat = "identity")+
  coord_flip()+
  geom_text(data = PC_loadings_bacteria_30,aes(x = reorder(FeatureID,PC1),y = ifelse(PC1 > 0, -0.01,0.01), label = taxonomy, hjust = ifelse(PC1 > 0, 1,0),color = ifelse(taxonomy =="Unassigned","gray60","black")),size = 5,fontface = ifelse(!(PC_loadings_bacteria_30$Genus %in% c("","Synechococcus_CC9902")|grepl("group",PC_loadings_bacteria_30$Genus)|grepl("clade",PC_loadings_bacteria_30$Genus,ignore.case = TRUE)),"bold.italic","bold"))+
  ylim(-0.21,0.21)+
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.length.y=unit(0, "cm"),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        legend.position = "none",
        axis.text.x = element_text(size = 15),
        axis.ticks.length.x = unit(0.25,"cm"),
        axis.title.x = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        plot.margin = margin(0, 0, 0, 0.95, "cm"),
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank())+
  scale_fill_manual(values = group_colors)+
  scale_color_manual(values = c("black","gray60"))+
  annotate("text",  x=Inf, y = Inf, label = "(h)", vjust=1.5, hjust=1.25,size = 12)

bacteria_PCA_gg

ggsave(here::here("figures", "PCA_figure", "16S_bacterial_PC1_unmerged_top30_100520.png"),bacteria_PCA_gg,width = 7, height = 8.5)

Vertebrate detections (Figure 4)

Some summary statistics

kosmos_12S_sample_sums <- as.data.frame(sample_sums(kosmos_12S_16_phyloseq))
kosmos_12S_sample_sums <- rownames_to_column(kosmos_12S_sample_sums,"sample")
colnames(kosmos_12S_sample_sums)[2] <- "total_reads"
kosmos_12S_sample_sums %>% mutate(.,sample_name = substr(gsub("KOSMOS","",sample),1,nchar(gsub("KOSMOS","",sample))-5)) -> kosmos_12S_sample_sums

# Add a mesocosm vs. Pacific column
kosmos_12S_sample_sums$sample_name <- as.character(kosmos_12S_sample_sums$sample_name)
kosmos_12S_sample_sums %>% mutate(.,station = substr(kosmos_12S_sample_sums$sample_name,nchar(kosmos_12S_sample_sums$sample_name)-1,nchar(kosmos_12S_sample_sums$sample_name))) -> kosmos_12S_sample_sums

kosmos_12S_sample_sums$sample_name <- factor(kosmos_12S_sample_sums$sample_name, levels = c("D1_M1",  "D1_MP", "D8_M1",  "D8_MP",  "D15_M1", "D15_MP", "D24_M1", "D24_MP", "D32_M1", "D32_MP", "D36_M1", "D36_MP", "D42_M1", "D42_MP", "D48_M1", "D48_MP"))

kosmos_12S_sample_sums_gg <- ggplot(kosmos_12S_sample_sums,aes(x = sample_name,y = total_reads,fill = station))+
  geom_bar(stat = "identity")+
    ggtitle("12S reads per sample")+
  theme(axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 15,angle = 90),
        axis.title = element_text(size = 18),
        plot.title = element_text(size = 25))+
  ylab("Total Reads")+
  xlab("Sample")

kosmos_12S_sample_sums_gg

ggsave(here::here("figures", "12S", "12S_sample_sums.png"),kosmos_12S_sample_sums_gg,width = 10, height = 6)

All samples

kosmos_12S_tax_table <- as.data.frame(as(tax_table(kosmos_12S_16_phyloseq),"matrix"))
kosmos_12S_tax_table <- rownames_to_column(kosmos_12S_tax_table,"ASV")
kosmos_12S_tax_table %>% mutate_all(as.character) -> kosmos_12S_tax_table

# Sum of reads by ASV
kosmos_12S_taxa_sums <- as.data.frame(taxa_sums(kosmos_12S_16_phyloseq))
kosmos_12S_taxa_sums <- rownames_to_column(kosmos_12S_taxa_sums,"ASV")
colnames(kosmos_12S_taxa_sums)[2] <- "total_reads"

kosmos_12S_taxa_sums <- left_join(kosmos_12S_taxa_sums, kosmos_12S_tax_table, by = "ASV")
kosmos_12S_taxa_sums <- kosmos_12S_taxa_sums[order(-kosmos_12S_taxa_sums$total_reads),]

# Let's group by taxonomy and sum
dplyr::select(kosmos_12S_taxa_sums,-ASV) %>% group_by(Kingdom,Phylum, Class,Order, Family, Genus, Species) %>% summarise_all(funs(sum)) -> kosmos_12S_taxa_sums_grouped 
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
kosmos_12S_taxa_sums_grouped <- kosmos_12S_taxa_sums_grouped[order(-kosmos_12S_taxa_sums_grouped$total_reads),]

write.csv(kosmos_12S_taxa_sums_grouped,here::here("figures", "12S", "kosmos_12S_taxa_sums.csv"))

Prep data for barplot

kosmos_12S_16_phyloseq_chordata <- subset_taxa(kosmos_12S_16_phyloseq, Phylum == "Chordata")

# Remove Paralichthys reads
kosmos_12S_16_phyloseq_chordata <- subset_taxa(kosmos_12S_16_phyloseq_chordata, Genus != "Paralichthys")

kosmos_12S_16_phyloseq_chordata_merged <- tax_glom(kosmos_12S_16_phyloseq_chordata,taxrank = rank_names(kosmos_12S_16_phyloseq_chordata)[7],NArm = FALSE)

kosmos_12S_chordata_merged_asv_table <- as.data.frame(otu_table(kosmos_12S_16_phyloseq_chordata_merged))
kosmos_12S_chordata_merged_asv_table <- rownames_to_column(kosmos_12S_chordata_merged_asv_table,"ASV")
kosmos_12S_chordata_merged_tax_table <- as.data.frame(as(tax_table(kosmos_12S_16_phyloseq_chordata_merged),"matrix"))
kosmos_12S_chordata_merged_tax_table <- rownames_to_column(kosmos_12S_chordata_merged_tax_table,"ASV")
kosmos_12S_chordata_merged_tax_table %>% mutate_all(as.character) -> kosmos_12S_chordata_merged_tax_table

kosmos_12S_chordata_merged_combined_table <- left_join(kosmos_12S_chordata_merged_asv_table,kosmos_12S_chordata_merged_tax_table,by = "ASV")

# Create a new taxonomy field, with best taxonomy (family or below)
kosmos_12S_chordata_merged_combined_table %>% mutate(.,taxonomy = ifelse(!(Species %in% c("s_","unassigned")),Species,
                                                                         ifelse(!(Genus %in% c("g_","unassigned")),Genus,
                                                                                ifelse(!(Family %in% c("unassigned")),Family,
                                                                                       ifelse(!(Order %in% c("unassigned")),Order,"unassigned"))))) -> kosmos_12S_chordata_merged_combined_table

# Sum of reads by taxonomy
kosmos_12S_chordata_merged_taxa_sums <- as.data.frame(taxa_sums(kosmos_12S_16_phyloseq_chordata_merged))
kosmos_12S_chordata_merged_taxa_sums <- rownames_to_column(kosmos_12S_chordata_merged_taxa_sums,"ASV")
colnames(kosmos_12S_chordata_merged_taxa_sums)[2] <- "total_reads"
kosmos_12S_chordata_merged_taxa_sums <- kosmos_12S_chordata_merged_taxa_sums[order(-kosmos_12S_chordata_merged_taxa_sums$total_reads),]
kosmos_12S_chordata_merged_taxa_sums <- left_join(kosmos_12S_chordata_merged_taxa_sums, kosmos_12S_chordata_merged_tax_table, by = "ASV")
# Determine top 19 taxa, so the rest can be "other
top19_kosmos_12S_taxa <- kosmos_12S_chordata_merged_taxa_sums$ASV[1:19]

kosmos_12S_chordata_merged_combined_table %>% mutate(.,taxonomy_updated = ifelse(ASV %in% top19_kosmos_12S_taxa,taxonomy,"other")) -> kosmos_12S_chordata_merged_combined_table 

# Convert to long form
gathercols <- colnames(dplyr::select(kosmos_12S_chordata_merged_asv_table,-ASV))
kosmos_12S_chordata_merged_combined_table_long <- gather(kosmos_12S_chordata_merged_combined_table,key = "sample",value = "reads",gathercols,factor_key = TRUE)

# Shorten sample names
kosmos_12S_chordata_merged_combined_table_long$sample <- as.character(kosmos_12S_chordata_merged_combined_table_long$sample)
kosmos_12S_chordata_merged_combined_table_long %>% mutate(.,sample_name = substr(gsub("KOSMOS","",sample),1,nchar(gsub("KOSMOS","",sample))-5)) -> kosmos_12S_chordata_merged_combined_table_long
kosmos_12S_chordata_merged_combined_table_long %>% mutate(.,station = substr(kosmos_12S_chordata_merged_combined_table_long$sample_name,nchar(kosmos_12S_chordata_merged_combined_table_long$sample_name)-1,nchar(kosmos_12S_chordata_merged_combined_table_long$sample_name))) -> kosmos_12S_chordata_merged_combined_table_long

kosmos_12S_chordata_merged_combined_table_long$sample_name <- factor(kosmos_12S_chordata_merged_combined_table_long$sample_name, levels = c("D1_M1",  "D1_MP", "D8_M1",  "D8_MP",  "D15_M1", "D15_MP", "D24_M1", "D24_MP", "D32_M1", "D32_MP", "D36_M1", "D36_MP", "D42_M1", "D42_MP", "D48_M1", "D48_MP"))

# Create a new data frame with total reads for each sample in order to get one border around each bar
dplyr::select(kosmos_12S_chordata_merged_combined_table_long,-c(ASV,Kingdom,Phylum, Class,Order, Family, Genus, Species,taxonomy,taxonomy_updated,sample,station)) %>% group_by(sample_name) %>% summarise_all(funs(sum)) -> kosmos_12S_chordata_sample_sums
colnames(kosmos_12S_chordata_sample_sums)[2] <- "total_reads_per_sample"
# Add these sample sums to original data frame
kosmos_12S_chordata_merged_combined_table_long <- left_join(kosmos_12S_chordata_merged_combined_table_long,kosmos_12S_chordata_sample_sums,by = "sample_name")

# Make a day column
kosmos_12S_chordata_merged_combined_table_long %>% mutate(.,day = gsub("_.*","",sample_name)) -> kosmos_12S_chordata_merged_combined_table_long
kosmos_12S_chordata_merged_combined_table_long$day <- factor(kosmos_12S_chordata_merged_combined_table_long$day,levels = c("D1","D8","D15","D24","D32","D36","D42","D48"))

# Re-order taxonomy_updated, so that other is last
kosmos_12S_chordata_merged_combined_table_long$taxonomy_updated = factor(kosmos_12S_chordata_merged_combined_table_long$taxonomy_updated, levels = c(sort(unique(kosmos_12S_chordata_merged_combined_table_long$taxonomy_updated)[!unique(kosmos_12S_chordata_merged_combined_table_long$taxonomy_updated) == "other"]),"other"))

Plot at genus level

# Determine top 10 genera
# dplyr::select(kosmos_M1_12S_taxa_sums,-ASV) %>% group_by(Kingdom,Phylum, Class,Order, Family, Genus, Species) %>% summarise_all(funs(sum)) -> kosmos_M1_12S_taxa_sums_grouped 
kosmos_12S_chordata_merged_combined_table_long %>% mutate(.,taxonomy_genus = ifelse(!(Genus %in% c("g_","unassigned")),Genus,
                                                                                ifelse(!(Family %in% c("unassigned")),paste0("Family ",Family),"unassigned"))) -> kosmos_12S_chordata_merged_combined_table_long

kosmos_12S_chordata_merged_taxa_sums %>% mutate(.,taxonomy_genus = ifelse(!(Genus %in% c("g_","unassigned")),Genus,
                                                                                ifelse(!(Family %in% c("unassigned")),paste0("Family ",Family),"unassigned"))) -> kosmos_12S_chordata_merged_taxa_sums

dplyr::select(kosmos_12S_chordata_merged_taxa_sums,-c(ASV,Kingdom,Phylum, Class,Order, Genus, Family, Species)) %>% group_by(taxonomy_genus) %>% summarise_all(funs(sum)) -> kosmos_12S_chordata_genus_sums
kosmos_12S_chordata_genus_sums <- kosmos_12S_chordata_genus_sums[order(-kosmos_12S_chordata_genus_sums$total_reads),]

top12_genera <- kosmos_12S_chordata_genus_sums$taxonomy_genus[1:11]

kosmos_12S_chordata_merged_combined_table_long %>% mutate(.,genus_updated = ifelse(taxonomy_genus %in% top12_genera,taxonomy_genus,"other")) -> kosmos_12S_chordata_merged_combined_table_long

kosmos_12S_chordata_merged_combined_table_long$genus_updated <- factor(kosmos_12S_chordata_merged_combined_table_long$genus_updated, levels = c("Engraulis", "Family Sciaenidae", "Family Engraulidae", "Odontesthes", "Family Haemulidae",  "Ethmidium", "Bairdiella", "Citharichthys", "Sardinops", "Scomber", "Mugil","other", "unassigned"))

Plot M1 separately

Load in the images of Inca Tern and Brown Pelican from phylopic

inca_tern <- readPNG(here::here("figures", "phylopic", "inca_tern.png"))
inca_tern_img <- rasterGrob(inca_tern, interpolate=TRUE)

pelican <- readPNG(here::here("figures", "phylopic", "pelican.png"))
pelican_img <- rasterGrob(pelican, interpolate=TRUE)

Make the plot

kosmos_12S_barplot <- ggplot(subset(kosmos_12S_chordata_merged_combined_table_long,station == "M1"), aes(x = day, y = reads,fill = genus_updated))+
  geom_bar(stat = "identity")+
  scale_fill_tableau(palette = "Tableau 20")+
  ylab("Total Reads")+
  xlab("Mesocosm Sample")+
  guides(color=FALSE,fill=guide_legend(title="Genus"))+
    theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank(),
        legend.position = "none")+
    scale_y_continuous(expand = c(0, 0),lim = c(0,22000))+
      annotate("text",  x=-Inf, y = Inf, label = "(b)", vjust=1.5, hjust=-0.25,size = 12)+
  # Add bird pictures from phylopic
    annotation_custom(pelican_img, xmin=5.5, xmax=6.5, ymin= 18030, ymax=21030) + 
    annotation_custom(inca_tern_img, xmin=6.5, xmax=7.5, ymin= 6228, ymax=9228) + 
    annotation_custom(inca_tern_img, xmin=7.5, xmax=8.5, ymin= 16916, ymax=19916)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
kosmos_12S_barplot

ggsave(here::here("figures", "12S", "12S_mesocosm_barplot_no_paralichthys_genus_plus_birds_090220.png"),kosmos_12S_barplot,width = 8, height = 3.5)

Plot MP separately

kosmos_12S_barplot <- ggplot(subset(kosmos_12S_chordata_merged_combined_table_long,station == "MP"), aes(x = day, y = reads,fill = genus_updated))+
  geom_bar(stat = "identity")+
  scale_fill_tableau(palette = "Tableau 20")+
  ylab("Total Reads")+
  xlab("Pacific Sample")+
  guides(color=FALSE,fill=guide_legend(title="Genus"))+
    theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank(),
        legend.position = "none")+
      scale_y_continuous(expand = c(0, 0),lim = c(0,22000))+
      annotate("text",  x=-Inf, y = Inf, label = "(a)", vjust=1.5, hjust=-0.25,size = 12)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
kosmos_12S_barplot

ggsave(here::here("figures", "12S", "12S_pacific_barplot_noparalichthys_genus.png"),kosmos_12S_barplot,width = 8, height = 3.5)

Create a legend

fakedata <- rep(0.05,12)
fakesample <- rep("nosample",12)
legend_data <- data.frame(allgroups,fakedata,fakesample)

legend_data$allgroups <- factor(unique(kosmos_12S_chordata_merged_combined_table_long$genus_updated), levels = c("Engraulis", "Family Sciaenidae", "Family Engraulidae", "Odontesthes", "Family Haemulidae",  "Ethmidium", "Bairdiella", "Citharichthys", "Sardinops", "Scomber", "Mugil","other", "unassigned"))

legend_gg<-ggplot(legend_data, aes(x=fakesample,y=fakedata,fill=allgroups))+
    geom_bar(stat='identity')+theme_minimal()+
  # scale_fill_tableau(palette = "Tableau 20")+
  scale_fill_tableau(palette = "Tableau 20",
                     labels = c(expression(italic("Engraulis")), "Family Sciaenidae", "Family Engraulidae", expression(italic("Odontesthes")), "Family Haemulidae",  expression(italic("Ethmidium")), expression(italic("Bairdiella")), expression(italic("Citharichthys")), expression(italic("Sardinops")), expression(italic("Scomber")), "other", "unassigned"))+
    theme(axis.text.x = element_blank(),
            legend.position = "right",
          axis.title=element_text(size=8),
          plot.margin = unit(c(0,5,0,0), "pt"),
          legend.text=element_text(size=10),
          legend.title = element_blank(),
          legend.text.align = 0
    )+
    guides(fill=guide_legend(ncol=4))+
    xlab("Sample")+
  ylab("Proportion of Reads")

legend_gg

ggsave(here::here("figures", "12S", "12S_genus_barplot_legend.png"),legend_gg,width = 8, height = 2)

Plot Paralichthys in M1

kosmos_12S_M1_genus_phyloseq <- tax_glom(kosmos_12S_M1_phyloseq,taxrank=rank_names(kosmos_12S_M1_phyloseq)[6], NArm = FALSE)

kosmos_12S_M1_genus_asv_table <- as.data.frame(as(otu_table(kosmos_12S_M1_genus_phyloseq),"matrix"))
kosmos_12S_M1_genus_asv_table <- rownames_to_column(kosmos_12S_M1_genus_asv_table,"ASV")
kosmos_12S_M1_genus_tax_table <- as.data.frame(as(tax_table(kosmos_12S_M1_genus_phyloseq),"matrix"))
kosmos_12S_M1_genus_tax_table <- rownames_to_column(kosmos_12S_M1_genus_tax_table,"ASV")

kosmos_12S_M1_genus_combined_table <- left_join(kosmos_12S_M1_genus_asv_table, kosmos_12S_M1_genus_tax_table, by = "ASV")

# Convert to long form
gathercols <- colnames(dplyr::select(kosmos_12S_M1_genus_asv_table,-ASV))
kosmos_12S_M1_genus_combined_table_gather <- gather(kosmos_12S_M1_genus_combined_table, key = "sample", value = "reads", gathercols, factor_key=TRUE)

kosmos_12S_M1_genus_combined_table_gather$sample <- as.character(kosmos_12S_M1_genus_combined_table_gather$sample)
kosmos_12S_M1_genus_combined_table_gather %>% mutate(.,day = gsub("_.*","", gsub("KOSMOSD","",sample))) -> kosmos_12S_M1_genus_combined_table_gather

paralichthys_M1_long <- subset(kosmos_12S_M1_genus_combined_table_gather, Genus == "Paralichthys")
paralichthys_M1_long$sample <- factor(paralichthys_M1_long$sample, levels = c("KOSMOSD1_M1BS", "KOSMOSD8_M1AS",  "KOSMOSD15_M1AS", "KOSMOSD24_M1AS", "KOSMOSD32_M1AS", "KOSMOSD36_M1AS", "KOSMOSD42_M1AS", "KOSMOSD48_M1AS"))

# paralichthys_M1_long$day <- factor(paralichthys_M1_long$day,levels = c("1","8","15","24","32","36","42","48"))
paralichthys_M1_long$day <- as.numeric(as.character(paralichthys_M1_long$day))
paralichthys_M1_long %>% mutate(.,sample_name = paste0("D",day)) -> paralichthys_M1_long
paralichthys_M1_long$sample_name <- factor(paralichthys_M1_long$sample_name,levels = c("D1","D8","D15","D24","D32","D36","D42","D48"))

paralichthys_M1_gg <- ggplot(paralichthys_M1_long, aes(x = sample_name, y = reads,fill = Genus))+
  geom_bar(stat = 'identity',width=0.35)+
  xlab("Mesocosm Sample")+
  ylab("Total Reads")+
  geom_vline(xintercept=4.75,color="#e41a1c",lty=2,size = 1.2)+
  scale_fill_manual(values = c("black"))+
  scale_y_continuous(expand = c(0, 0),lim = c(0,8500))+
  # annotate(geom = "text",x = 4.65, y = 4250,angle = 90, color = "#e41a1c", size = 2, label = expression(paste(italic("Paralichthys adspersus")," eggs added on day 31")))+
  annotate(geom = "text",x = 4.38, y = 4250,angle = 90, color = "#e41a1c", size = 6, label = expression(paste(italic("Paralichthys adspersus"))))+
  annotate(geom = "text",x = 4.57, y = 4250,angle = 90, color = "#e41a1c", size = 6, label = "eggs added on day 31")+
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        # legend.position = "none",
        axis.text = element_text(size = 15),
        axis.ticks.length.y = unit(0.25,"cm"),
        axis.title = element_text(size = 20),
        panel.background = element_rect(fill = "white",color = "white",size = 1),
        panel.grid.major.x = element_blank(), 
        # panel.grid.major.y = element_line(size = 1, color = "gray70"),
        panel.grid.minor = element_blank(),
        legend.position = "none")+
      annotate("text",  x=-Inf, y = Inf, label = "(c)", vjust=1.5, hjust=-0.25,size = 12)+
  annotate("rect",xmin = 1, xmax = 1.35, ymin = 5500, ymax = 6250,fill = "black",color = "black")+
  annotate("text", x = 1.45, y = 5875, label = "Paralichthys",hjust = 0,size = 6)
  
paralichthys_M1_gg
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

ggsave(here::here("figures", "12S", "paralichthys_M1.png"),paralichthys_M1_gg,width = 8, height = 4)
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'